/*********************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/

/*********************************************************************
* 2014/07/28 Saar
*        BLASTEST               : OK
*        CTEST                  : OK
*        TEST                   : OK
*
* 2013/10/28 Saar
* Parameter:
*	SGEMM_DEFAULT_UNROLL_N	4
*	SGEMM_DEFAULT_UNROLL_M	16
*	SGEMM_DEFAULT_P		768
*	SGEMM_DEFAULT_Q		384
*	A_PR1			512
*	B_PR1			512
*	
* 
* 2014/07/28 Saar
* Performance at 9216x9216x9216:
*       1 thread:      102 GFLOPS       (SANDYBRIDGE:  59)      (MKL:   83)
*       2 threads:     195 GFLOPS       (SANDYBRIDGE: 116)      (MKL:  155)
*       3 threads:     281 GFLOPS       (SANDYBRIDGE: 165)      (MKL:  230)
*       4 threads:     366 GFLOPS       (SANDYBRIDGE: 223)      (MKL:  267)
*
*********************************************************************/

#define ASSEMBLER
#include "common.h"
 
/*
 * Symbolic names for the general-purpose registers used by the kernel.
 *
 * Several names map to the same physical register and therefore must not
 * be live at the same time:
 *   %rdi : OLD_M, AO, BO1
 *   %rsi : OLD_N, BO
 *   %rdx : OLD_K, CO2
 *   %rbp : BI, BO2
 *
 * OLD_M / OLD_N / OLD_K are the incoming values; the kernel entry code
 * copies them to M, the N stack slot and K before the shared registers
 * are reused.
 */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define J	%r14
#define OLD_K	%rdx

/* Matrix base pointers and the C column stride. */
#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10
	
/* Loop counters and working pointers used inside the tile loops. */
#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define K	%r12
#define BI	%rbp
#define BO2	%rbp
/* SP receives the value of %rsp before the buffer area is carved out. */
#define	SP	%rbx

#define BO1	%rdi
#define	CO2	%rdx

/*
 * STACKSIZE is the number of bytes subtracted from %rsp at kernel entry to
 * hold the saved registers. The WINDOWS_ABI variant is larger because it
 * also stores %rdi, %rsi and %xmm6-%xmm15 there.
 */
#ifndef WINDOWS_ABI

#define STACKSIZE 96

#else

#define STACKSIZE 256

/*
 * Arguments that are read from the caller's stack under WINDOWS_ABI.
 * The offsets are relative to %rsp after STACKSIZE has been subtracted.
 */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/*
 * Size in bytes of the on-stack scratch area reserved for BUFFER1
 * (the kernel subtracts 128 + L_BUFFER_SIZE from %rsp).
 */
#if defined(OS_WINDOWS)
#define L_BUFFER_SIZE 8192
#else
#define L_BUFFER_SIZE 12288
#endif

/*
 * Local variables addressed relative to %rsp AFTER the kernel has reserved
 * and aligned the scratch area (they are not valid before that point).
 *
 * NOTE: despite their names, the GEMM entry code stores N / 12 in Ndiv6
 * and N % 12 in Nmod6.
 * OFFSET, KK and KKK are not referenced in the GEMM entry code that
 * precedes the first tile loop; NOTE(review): presumably used by the
 * TRMM variant - confirm.
 * BUFFER1 is the start of the scratch buffer that receives the re-packed
 * B panel.
 */
#define Ndiv6	 24(%rsp)
#define Nmod6	 32(%rsp)
#define N	 40(%rsp)
#define ALPHA	 48(%rsp)
#define OFFSET	 56(%rsp)
#define KK	 64(%rsp)
#define KKK	 72(%rsp)
#define BUFFER1	           128(%rsp)

/*
 * STACK_TOUCH: on OS_WINDOWS, writes a zero dword at every 4096-byte step
 * above the new %rsp, highest offset first. The number of steps depends on
 * L_BUFFER_SIZE. On other systems the macro expands to nothing.
 * NOTE(review): presumably this commits the freshly reserved stack pages
 * in order (guard-page growth) - confirm.
 */
#if defined(OS_WINDOWS)
#if   L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
        movl    $0,  4096 * 4(%rsp);\
        movl    $0,  4096 * 3(%rsp);\
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
        movl    $0,  4096 * 3(%rsp);\
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
        movl    $0,  4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif

/*
 * Fused multiply-add wrappers. In both variants the FIRST macro argument
 * is the accumulator:  arg0 = arg0 + arg1 * arg2.
 *   BULLDOZER : four-operand vfmaddps / vfmaddss
 *   otherwise : three-operand vfmadd231ps / vfmadd231ss
 * The _PS_ form works on packed single-precision lanes, the _SS_ form on
 * the low scalar lane only.
 */
#if defined(BULLDOZER)

#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0

#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0

#else

#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0

#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0

#endif


/*
 * Prefetch distances in bytes. A_PR1 is the displacement used by the
 * prefetcht0 on AO in KERNEL16x6_SUB; B_PR1 is not referenced by any of
 * the macros defined below.
 */
#define	A_PR1	512
#define	B_PR1	512

/*******************************************************************************************
* 6 lines of N
*******************************************************************************************/

/*
 * KERNEL16x6_SUB: one k-step of the 16x6 tile update.
 *
 *   Reads : two 8-lane vectors of A at -16*SIZE(AO) and -8*SIZE(AO),
 *           six B scalars at -4*SIZE(BO) .. 1*SIZE(BO) (each broadcast).
 *           The 6-column loop biases AO by +16*SIZE and BO by +4*SIZE
 *           before entering, so these offsets start at the current element.
 *   Accum : ymm4..ymm15; for B value j (0..5) ymm(4+2j) takes the first
 *           A vector and ymm(5+2j) the second.
 *   Temps : ymm0-ymm3.
 *   Exit  : BO += 6*SIZE, AO += 16*SIZE, %rax -= 1. decq is the last
 *           instruction, so its ZF is what the unrolled loop tests with je.
 */
.macro KERNEL16x6_SUB
	vmovups 	-16 * SIZE(AO), %ymm0
	vmovups 	 -8 * SIZE(AO), %ymm1
	vbroadcastss	 -4 * SIZE(BO), %ymm2
	vbroadcastss	 -3 * SIZE(BO), %ymm3
	// prefetch A data A_PR1 bytes ahead of the current position
	prefetcht0	A_PR1(AO)

	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(  	%ymm5,%ymm2,%ymm1 )
	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
	VFMADD231PS_(  	%ymm7,%ymm3,%ymm1 )

	vbroadcastss	 -2 * SIZE(BO), %ymm2
	vbroadcastss	 -1 * SIZE(BO), %ymm3
	VFMADD231PS_(  	%ymm8,%ymm2,%ymm0  )
	VFMADD231PS_(  	%ymm9,%ymm2,%ymm1  )
	VFMADD231PS_(  	%ymm10,%ymm3,%ymm0 )
	VFMADD231PS_(  	%ymm11,%ymm3,%ymm1 )

	vbroadcastss	  0 * SIZE(BO), %ymm2
	vbroadcastss	  1 * SIZE(BO), %ymm3
	VFMADD231PS_(  	%ymm12,%ymm2,%ymm0  )
	VFMADD231PS_(  	%ymm13,%ymm2,%ymm1  )
	VFMADD231PS_(  	%ymm14,%ymm3,%ymm0 )
	VFMADD231PS_(  	%ymm15,%ymm3,%ymm1 )

	// advance to the next k-step; flags from decq are live on exit
	addq	$ 6*SIZE, BO 
	addq	$ 16*SIZE, AO 
	decq	%rax 
.endm

/*
 * SAVE16x6: write the 16x6 accumulator tile (ymm4..ymm15) to C.
 *
 *   1. Every accumulator is multiplied by ALPHA (broadcast into ymm0).
 *   2. Unless TRMMKERNEL is defined, the current contents of C are added.
 *   3. The results are stored with unaligned stores.
 *
 * Column addressing: CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC, CO2+2*LDC.
 * The 6-column path sets CO2 = C + 3*LDC before the tile loop.
 * All loads from C are issued before the first store.
 * Clobbers ymm0 and overwrites ymm4..ymm15 with the stored values.
 */
.macro SAVE16x6

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm5 , %ymm5
	vmulps	%ymm0 , %ymm6 , %ymm6
	vmulps	%ymm0 , %ymm7 , %ymm7
	vmulps	%ymm0 , %ymm8 , %ymm8
	vmulps	%ymm0 , %ymm9 , %ymm9
	vmulps	%ymm0 , %ymm10, %ymm10
	vmulps	%ymm0 , %ymm11, %ymm11
	vmulps	%ymm0 , %ymm12, %ymm12
	vmulps	%ymm0 , %ymm13, %ymm13
	vmulps	%ymm0 , %ymm14, %ymm14
	vmulps	%ymm0 , %ymm15, %ymm15


#if !defined(TRMMKERNEL)

	// accumulate into the existing C values
	vaddps 	        (CO1), %ymm4,%ymm4
	vaddps  8 * SIZE(CO1), %ymm5,%ymm5

	vaddps 	        (CO1, LDC), %ymm6,%ymm6
	vaddps  8 * SIZE(CO1, LDC), %ymm7,%ymm7

	vaddps 	        (CO1, LDC,2), %ymm8,%ymm8
	vaddps  8 * SIZE(CO1, LDC,2), %ymm9,%ymm9

	vaddps 	        (CO2), %ymm10,%ymm10
	vaddps  8 * SIZE(CO2), %ymm11,%ymm11

	vaddps 	        (CO2, LDC), %ymm12,%ymm12
	vaddps  8 * SIZE(CO2, LDC), %ymm13,%ymm13

	vaddps 	        (CO2, LDC,2), %ymm14,%ymm14
	vaddps  8 * SIZE(CO2, LDC,2), %ymm15,%ymm15

#endif

	vmovups	%ymm4 ,  	(CO1)
	vmovups	%ymm5 , 8 * SIZE(CO1)

	vmovups	%ymm6 ,  	(CO1, LDC)
	vmovups	%ymm7 , 8 * SIZE(CO1, LDC)

	vmovups	%ymm8 ,  	(CO1, LDC,2)
	vmovups	%ymm9 , 8 * SIZE(CO1, LDC,2)

	vmovups	%ymm10,  	(CO2)
	vmovups	%ymm11, 8 * SIZE(CO2)

	vmovups	%ymm12,  	(CO2, LDC)
	vmovups	%ymm13, 8 * SIZE(CO2, LDC)

	vmovups	%ymm14,  	(CO2, LDC,2)
	vmovups	%ymm15, 8 * SIZE(CO2, LDC,2)

.endm



/*******************************************************************************************/

/*
 * KERNEL8x6_SUB: one k-step of the 8x6 tile update.
 *
 *   Reads : one 8-lane vector of A at -16*SIZE(AO) and six B scalars at
 *           -4*SIZE(BO) .. 1*SIZE(BO), each broadcast to all lanes.
 *   Accum : ymm4, ymm6, ymm8, ymm10, ymm12, ymm14 (one per B value).
 *   Temps : ymm0, ymm2, ymm3.
 *   Exit  : BO += 6*SIZE, AO += 8*SIZE, %rax -= 1 (flags from decq are
 *           live on exit).
 */
.macro KERNEL8x6_SUB
	vmovups 	-16 * SIZE(AO), %ymm0
	vbroadcastss	 -4 * SIZE(BO), %ymm2
	vbroadcastss	 -3 * SIZE(BO), %ymm3

	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )

	vbroadcastss	 -2 * SIZE(BO), %ymm2
	vbroadcastss	 -1 * SIZE(BO), %ymm3
	VFMADD231PS_(  	%ymm8,%ymm2,%ymm0  )
	VFMADD231PS_(  	%ymm10,%ymm3,%ymm0 )

	vbroadcastss	  0 * SIZE(BO), %ymm2
	vbroadcastss	  1 * SIZE(BO), %ymm3
	VFMADD231PS_(  	%ymm12,%ymm2,%ymm0  )
	VFMADD231PS_(  	%ymm14,%ymm3,%ymm0 )

	addq	$ 6*SIZE, BO 
	addq	$ 8*SIZE, AO 
	decq	%rax 
.endm

/*
 * SAVE8x6: write the 8x6 accumulator tile to C.
 *
 * Scales ymm4/6/8/10/12/14 by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores one 8-lane vector per column at
 * CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC, CO2+2*LDC.
 * All loads from C precede the first store. Clobbers ymm0.
 */
.macro SAVE8x6

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm6 , %ymm6
	vmulps	%ymm0 , %ymm8 , %ymm8
	vmulps	%ymm0 , %ymm10, %ymm10
	vmulps	%ymm0 , %ymm12, %ymm12
	vmulps	%ymm0 , %ymm14, %ymm14


#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %ymm4,%ymm4
	vaddps 	        (CO1, LDC), %ymm6,%ymm6
	vaddps 	        (CO1, LDC,2), %ymm8,%ymm8
	vaddps 	        (CO2), %ymm10,%ymm10
	vaddps 	        (CO2, LDC), %ymm12,%ymm12
	vaddps 	        (CO2, LDC,2), %ymm14,%ymm14

#endif

	vmovups	%ymm4 ,  	(CO1)
	vmovups	%ymm6 ,  	(CO1, LDC)
	vmovups	%ymm8 ,  	(CO1, LDC,2)
	vmovups	%ymm10,  	(CO2)
	vmovups	%ymm12,  	(CO2, LDC)
	vmovups	%ymm14,  	(CO2, LDC,2)

.endm



/*******************************************************************************************/

/*
 * KERNEL4x6_SUB: one k-step of the 4x6 tile update (128-bit registers).
 *
 *   Reads : one 4-lane vector of A at -16*SIZE(AO) and six B scalars at
 *           -4*SIZE(BO) .. 1*SIZE(BO), each broadcast to all lanes.
 *   Accum : xmm4, xmm6, xmm8, xmm10, xmm12, xmm14 (one per B value).
 *   Temps : xmm0, xmm2, xmm3.
 *   Exit  : BO += 6*SIZE, AO += 4*SIZE, %rax -= 1 (flags from decq are
 *           live on exit).
 */
.macro KERNEL4x6_SUB
	vmovups 	-16 * SIZE(AO), %xmm0
	vbroadcastss	 -4 * SIZE(BO), %xmm2
	vbroadcastss	 -3 * SIZE(BO), %xmm3

	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231PS_(  	%xmm6,%xmm3,%xmm0 )

	vbroadcastss	 -2 * SIZE(BO), %xmm2
	vbroadcastss	 -1 * SIZE(BO), %xmm3
	VFMADD231PS_(  	%xmm8,%xmm2,%xmm0  )
	VFMADD231PS_(  	%xmm10,%xmm3,%xmm0 )

	vbroadcastss	  0 * SIZE(BO), %xmm2
	vbroadcastss	  1 * SIZE(BO), %xmm3
	VFMADD231PS_(  	%xmm12,%xmm2,%xmm0  )
	VFMADD231PS_(  	%xmm14,%xmm3,%xmm0 )

	addq	$ 6*SIZE, BO 
	addq	$ 4*SIZE, AO 
	decq	%rax 
.endm

/*
 * SAVE4x6: write the 4x6 accumulator tile to C.
 *
 * Scales xmm4/6/8/10/12/14 by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores one 4-lane vector per column at
 * CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC, CO2+2*LDC.
 * All loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE4x6

	vbroadcastss	ALPHA, %xmm0

	vmulps	%xmm0 , %xmm4 , %xmm4
	vmulps	%xmm0 , %xmm6 , %xmm6
	vmulps	%xmm0 , %xmm8 , %xmm8
	vmulps	%xmm0 , %xmm10, %xmm10
	vmulps	%xmm0 , %xmm12, %xmm12
	vmulps	%xmm0 , %xmm14, %xmm14


#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %xmm4,%xmm4
	vaddps 	        (CO1, LDC), %xmm6,%xmm6
	vaddps 	        (CO1, LDC,2), %xmm8,%xmm8
	vaddps 	        (CO2), %xmm10,%xmm10
	vaddps 	        (CO2, LDC), %xmm12,%xmm12
	vaddps 	        (CO2, LDC,2), %xmm14,%xmm14

#endif

	vmovups	%xmm4 ,  	(CO1)
	vmovups	%xmm6 ,  	(CO1, LDC)
	vmovups	%xmm8 ,  	(CO1, LDC,2)
	vmovups	%xmm10,  	(CO2)
	vmovups	%xmm12,  	(CO2, LDC)
	vmovups	%xmm14,  	(CO2, LDC,2)

.endm


/*******************************************************************************************/

/*
 * KERNEL2x6_SUB: one k-step of the 2x6 tile update using scalar FMA.
 *
 *   Reads : two A scalars at -16*SIZE(AO) and -15*SIZE(AO), six B scalars
 *           at -4*SIZE(BO) .. 1*SIZE(BO).
 *   Accum : low lane of xmm4..xmm15; for B value j (0..5) xmm(4+2j) takes
 *           the first A scalar and xmm(5+2j) the second.
 *   Temps : xmm0-xmm3.
 *   Exit  : BO += 6*SIZE, AO += 2*SIZE, %rax -= 1 (flags from decq are
 *           live on exit).
 */
.macro KERNEL2x6_SUB
	vmovss 	-16 * SIZE(AO), %xmm0
	vmovss 	-15 * SIZE(AO), %xmm1
	vmovss	 -4 * SIZE(BO), %xmm2
	vmovss	 -3 * SIZE(BO), %xmm3

	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )
	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
	VFMADD231SS_(  	%xmm7,%xmm3,%xmm1 )

	vmovss	 -2 * SIZE(BO), %xmm2
	vmovss	 -1 * SIZE(BO), %xmm3
	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
	VFMADD231SS_(  	%xmm9,%xmm2,%xmm1  )
	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )
	VFMADD231SS_(  	%xmm11,%xmm3,%xmm1 )

	vmovss	  0 * SIZE(BO), %xmm2
	vmovss	  1 * SIZE(BO), %xmm3
	VFMADD231SS_(  	%xmm12,%xmm2,%xmm0  )
	VFMADD231SS_(  	%xmm13,%xmm2,%xmm1  )
	VFMADD231SS_(  	%xmm14,%xmm3,%xmm0 )
	VFMADD231SS_(  	%xmm15,%xmm3,%xmm1 )

	addq	$ 6*SIZE, BO 
	addq	$ 2*SIZE, AO 
	decq	%rax 
.endm

/*
 * SAVE2x6: write the 2x6 scalar accumulator tile (low lanes of
 * xmm4..xmm15) to C.
 *
 * Each accumulator is multiplied by ALPHA, the existing C element is added
 * unless TRMMKERNEL is defined, and the result is stored as a scalar.
 * Row 0 / row 1 of column j live at offsets 0 and 1*SIZE from
 * CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC, CO2+2*LDC.
 * All loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE2x6

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm5 , %xmm5
	vmulss	%xmm0 , %xmm6 , %xmm6
	vmulss	%xmm0 , %xmm7 , %xmm7
	vmulss	%xmm0 , %xmm8 , %xmm8
	vmulss	%xmm0 , %xmm9 , %xmm9
	vmulss	%xmm0 , %xmm10, %xmm10
	vmulss	%xmm0 , %xmm11, %xmm11
	vmulss	%xmm0 , %xmm12, %xmm12
	vmulss	%xmm0 , %xmm13, %xmm13
	vmulss	%xmm0 , %xmm14, %xmm14
	vmulss	%xmm0 , %xmm15, %xmm15


#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4
	vaddss  1 * SIZE(CO1), %xmm5,%xmm5

	vaddss 	        (CO1, LDC), %xmm6,%xmm6
	vaddss  1 * SIZE(CO1, LDC), %xmm7,%xmm7

	vaddss 	        (CO1, LDC,2), %xmm8,%xmm8
	vaddss  1 * SIZE(CO1, LDC,2), %xmm9,%xmm9

	vaddss 	        (CO2), %xmm10,%xmm10
	vaddss  1 * SIZE(CO2), %xmm11,%xmm11

	vaddss 	        (CO2, LDC), %xmm12,%xmm12
	vaddss  1 * SIZE(CO2, LDC), %xmm13,%xmm13

	vaddss 	        (CO2, LDC,2), %xmm14,%xmm14
	vaddss  1 * SIZE(CO2, LDC,2), %xmm15,%xmm15

#endif

	vmovss	%xmm4 ,  	(CO1)
	vmovss	%xmm5 , 1 * SIZE(CO1)

	vmovss	%xmm6 ,  	(CO1, LDC)
	vmovss	%xmm7 , 1 * SIZE(CO1, LDC)

	vmovss	%xmm8 ,  	(CO1, LDC,2)
	vmovss	%xmm9 , 1 * SIZE(CO1, LDC,2)

	vmovss	%xmm10,  	(CO2)
	vmovss	%xmm11, 1 * SIZE(CO2)

	vmovss	%xmm12,  	(CO2, LDC)
	vmovss	%xmm13, 1 * SIZE(CO2, LDC)

	vmovss	%xmm14,  	(CO2, LDC,2)
	vmovss	%xmm15, 1 * SIZE(CO2, LDC,2)

.endm


/*******************************************************************************************/

/*
 * KERNEL1x6_SUB: one k-step of the 1x6 tile update using scalar FMA.
 *
 *   Reads : one A scalar at -16*SIZE(AO) and six B scalars at
 *           -4*SIZE(BO) .. 1*SIZE(BO).
 *   Accum : low lane of xmm4, xmm6, xmm8, xmm10, xmm12, xmm14.
 *   Temps : xmm0, xmm2, xmm3.
 *   Exit  : BO += 6*SIZE, AO += 1*SIZE, %rax -= 1 (flags from decq are
 *           live on exit).
 */
.macro KERNEL1x6_SUB
	vmovss 	-16 * SIZE(AO), %xmm0
	vmovss	 -4 * SIZE(BO), %xmm2
	vmovss	 -3 * SIZE(BO), %xmm3

	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )

	vmovss	 -2 * SIZE(BO), %xmm2
	vmovss	 -1 * SIZE(BO), %xmm3
	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )

	vmovss	  0 * SIZE(BO), %xmm2
	vmovss	  1 * SIZE(BO), %xmm3
	VFMADD231SS_(  	%xmm12,%xmm2,%xmm0  )
	VFMADD231SS_(  	%xmm14,%xmm3,%xmm0 )

	addq	$ 6*SIZE, BO 
	addq	$ 1*SIZE, AO 
	decq	%rax 
.endm

/*
 * SAVE1x6: write the 1x6 scalar accumulator tile to C.
 *
 * Scales the low lane of xmm4/6/8/10/12/14 by ALPHA, adds the existing C
 * element unless TRMMKERNEL is defined, then stores one scalar per column
 * at CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC, CO2+2*LDC.
 * All loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE1x6

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm6 , %xmm6
	vmulss	%xmm0 , %xmm8 , %xmm8
	vmulss	%xmm0 , %xmm10, %xmm10
	vmulss	%xmm0 , %xmm12, %xmm12
	vmulss	%xmm0 , %xmm14, %xmm14

#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4
	vaddss 	        (CO1, LDC), %xmm6,%xmm6
	vaddss 	        (CO1, LDC,2), %xmm8,%xmm8
	vaddss 	        (CO2), %xmm10,%xmm10
	vaddss 	        (CO2, LDC), %xmm12,%xmm12
	vaddss 	        (CO2, LDC,2), %xmm14,%xmm14

#endif

	vmovss	%xmm4 ,  	(CO1)
	vmovss	%xmm6 ,  	(CO1, LDC)
	vmovss	%xmm8 ,  	(CO1, LDC,2)
	vmovss	%xmm10,  	(CO2)
	vmovss	%xmm12,  	(CO2, LDC)
	vmovss	%xmm14,  	(CO2, LDC,2)

.endm


/*******************************************************************************************/


/*******************************************************************************************
* 4 lines of N
*******************************************************************************************/

/*
 * KERNEL16x4_SUB: one k-step of the 16x4 tile update.
 *
 * Unlike the 6-column kernels, AO and BO are NOT modified here; the
 * element indices %rax (into A) and BI (into B) are advanced instead and
 * are scaled by SIZE in the addressing mode.
 *
 *   Reads : two 8-lane vectors of A at index %rax (offsets -16 and -8),
 *           four B scalars at index BI (offsets -4 .. -1), each broadcast.
 *   Accum : ymm4..ymm11; for B value j (0..3) ymm(4+2j) takes the first
 *           A vector and ymm(5+2j) the second.
 *   Temps : ymm0-ymm3.
 *   Exit  : BI += 4, %rax += 16 (flags from the final addq are live).
 */
.macro KERNEL16x4_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm1
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(  	%ymm5,%ymm2,%ymm1 )
	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
	VFMADD231PS_(  	%ymm7,%ymm3,%ymm1 )
	vbroadcastss	 -2 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	 -1 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(  	%ymm8,%ymm2,%ymm0  )
	VFMADD231PS_(  	%ymm9,%ymm2,%ymm1  )
	VFMADD231PS_(  	%ymm10,%ymm3,%ymm0 )
	VFMADD231PS_(  	%ymm11,%ymm3,%ymm1 )
	addq	$ 4 , BI	
	addq	$ 16, %rax 
.endm

/*
 * SAVE16x4: write the 16x4 accumulator tile (ymm4..ymm11) to C.
 *
 * Scales every accumulator by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, stores two 8-lane vectors per column, and finally
 * issues a prefetcht0 at byte offset 64 from each of the four column
 * pointers.
 *
 * Column addressing: CO1, CO1+LDC, CO2, CO2+LDC.
 * NOTE(review): CO2 is presumably CO1 + 2*LDC in the 4-column path; the
 * code that sets it is not part of this macro - confirm at the call site.
 * All loads from C precede the first store. Clobbers ymm0.
 */
.macro SAVE16x4

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm5 , %ymm5
	vmulps	%ymm0 , %ymm6 , %ymm6
	vmulps	%ymm0 , %ymm7 , %ymm7
	vmulps	%ymm0 , %ymm8 , %ymm8
	vmulps	%ymm0 , %ymm9 , %ymm9
	vmulps	%ymm0 , %ymm10, %ymm10
	vmulps	%ymm0 , %ymm11, %ymm11


#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %ymm4,%ymm4
	vaddps  8 * SIZE(CO1), %ymm5,%ymm5

	vaddps 	        (CO1, LDC), %ymm6,%ymm6
	vaddps  8 * SIZE(CO1, LDC), %ymm7,%ymm7

	vaddps 	        (CO2), %ymm8,%ymm8
	vaddps  8 * SIZE(CO2), %ymm9,%ymm9

	vaddps 	        (CO2, LDC), %ymm10,%ymm10
	vaddps  8 * SIZE(CO2, LDC), %ymm11,%ymm11

#endif

	vmovups	%ymm4 ,  	(CO1)
	vmovups	%ymm5 , 8 * SIZE(CO1)

	vmovups	%ymm6 ,  	(CO1, LDC)
	vmovups	%ymm7 , 8 * SIZE(CO1, LDC)

	vmovups	%ymm8 ,  	(CO2)
	vmovups	%ymm9 , 8 * SIZE(CO2)

	vmovups	%ymm10,  	(CO2, LDC)
	vmovups	%ymm11, 8 * SIZE(CO2, LDC)

	// prefetch 64 bytes past the start of each column just written
	prefetcht0	64(CO1)
	prefetcht0	64(CO1, LDC)
	prefetcht0	64(CO2)
	prefetcht0	64(CO2, LDC)

.endm



/*******************************************************************************************/

/*
 * KERNEL8x4_SUB: one k-step of the 8x4 tile update.
 *
 *   Reads : one 8-lane vector of A at index %rax (offset -16) and four B
 *           scalars at index BI (offsets -4 .. -1), each broadcast.
 *   Accum : ymm4, ymm6, ymm8, ymm10 (one per B value).
 *   Temps : ymm0, ymm2, ymm3.
 *   Exit  : BI += 4, %rax += 8; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL8x4_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
	vbroadcastss	 -2 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	 -1 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(  	%ymm8,%ymm2,%ymm0  )
	VFMADD231PS_(  	%ymm10,%ymm3,%ymm0 )
	addq	$ 4 , BI	
	addq	$ 8 , %rax 
.endm

/*
 * SAVE8x4: write the 8x4 accumulator tile to C.
 *
 * Scales ymm4/6/8/10 by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores one 8-lane vector per column at
 * CO1, CO1+LDC, CO2, CO2+LDC.
 * All loads from C precede the first store. Clobbers ymm0.
 */
.macro SAVE8x4

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm6 , %ymm6
	vmulps	%ymm0 , %ymm8 , %ymm8
	vmulps	%ymm0 , %ymm10, %ymm10


#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %ymm4,%ymm4
	vaddps 	        (CO1, LDC), %ymm6,%ymm6
	vaddps 	        (CO2), %ymm8,%ymm8
	vaddps 	        (CO2, LDC), %ymm10,%ymm10

#endif

	vmovups	%ymm4 ,  	(CO1)
	vmovups	%ymm6 ,  	(CO1, LDC)
	vmovups	%ymm8 ,  	(CO2)
	vmovups	%ymm10,  	(CO2, LDC)

.endm



/*******************************************************************************************/

/*
 * KERNEL4x4_SUB: one k-step of the 4x4 tile update (128-bit registers).
 *
 *   Reads : one 4-lane vector of A at index %rax (offset -16) and four B
 *           scalars at index BI (offsets -4 .. -1), each broadcast.
 *   Accum : xmm4, xmm6, xmm8, xmm10 (one per B value).
 *   Temps : xmm0, xmm2, xmm3.
 *   Exit  : BI += 4, %rax += 4; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL4x4_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231PS_(  	%xmm6,%xmm3,%xmm0 )
	vbroadcastss	 -2 * SIZE(BO, BI, SIZE), %xmm2
	vbroadcastss	 -1 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231PS_(  	%xmm8,%xmm2,%xmm0  )
	VFMADD231PS_(  	%xmm10,%xmm3,%xmm0 )
	addq	$ 4 , BI	
	addq	$ 4 , %rax 
.endm

/*
 * SAVE4x4: write the 4x4 accumulator tile to C.
 *
 * Scales xmm4/6/8/10 by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores one 4-lane vector per column at
 * CO1, CO1+LDC, CO2, CO2+LDC.
 * All loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE4x4

	vbroadcastss	ALPHA, %xmm0

	vmulps	%xmm0 , %xmm4 , %xmm4
	vmulps	%xmm0 , %xmm6 , %xmm6
	vmulps	%xmm0 , %xmm8 , %xmm8
	vmulps	%xmm0 , %xmm10, %xmm10


#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %xmm4,%xmm4
	vaddps 	        (CO1, LDC), %xmm6,%xmm6
	vaddps 	        (CO2), %xmm8,%xmm8
	vaddps 	        (CO2, LDC), %xmm10,%xmm10

#endif

	vmovups	%xmm4 ,  	(CO1)
	vmovups	%xmm6 ,  	(CO1, LDC)
	vmovups	%xmm8 ,  	(CO2)
	vmovups	%xmm10,  	(CO2, LDC)

.endm


/*******************************************************************************************/

/*
 * KERNEL2x4_SUB: one k-step of the 2x4 tile update using scalar FMA.
 *
 *   Reads : two A scalars at index %rax (offsets -16 and -15) and four B
 *           scalars at index BI (offsets -4 .. -1).
 *   Accum : low lane of xmm4..xmm11; for B value j (0..3) xmm(4+2j) takes
 *           the first A scalar and xmm(5+2j) the second.
 *   Temps : xmm0-xmm3.
 *   Exit  : BI += 4, %rax += 2; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL2x4_SUB
	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss 	-15 * SIZE(AO, %rax, SIZE), %xmm1
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )
	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
	VFMADD231SS_(  	%xmm7,%xmm3,%xmm1 )
	vmovss	 -2 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -1 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
	VFMADD231SS_(  	%xmm9,%xmm2,%xmm1  )
	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )
	VFMADD231SS_(  	%xmm11,%xmm3,%xmm1 )
	addq	$ 4 , BI	
	addq	$ 2, %rax 
.endm

/*
 * SAVE2x4: write the 2x4 scalar accumulator tile (low lanes of
 * xmm4..xmm11) to C.
 *
 * Each accumulator is multiplied by ALPHA, the existing C element is added
 * unless TRMMKERNEL is defined, and the result is stored as a scalar.
 * Row 0 / row 1 of each column live at offsets 0 and 1*SIZE from
 * CO1, CO1+LDC, CO2, CO2+LDC.
 * All loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE2x4

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm5 , %xmm5
	vmulss	%xmm0 , %xmm6 , %xmm6
	vmulss	%xmm0 , %xmm7 , %xmm7
	vmulss	%xmm0 , %xmm8 , %xmm8
	vmulss	%xmm0 , %xmm9 , %xmm9
	vmulss	%xmm0 , %xmm10, %xmm10
	vmulss	%xmm0 , %xmm11, %xmm11


#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4
	vaddss  1 * SIZE(CO1), %xmm5,%xmm5

	vaddss 	        (CO1, LDC), %xmm6,%xmm6
	vaddss  1 * SIZE(CO1, LDC), %xmm7,%xmm7

	vaddss 	        (CO2), %xmm8,%xmm8
	vaddss  1 * SIZE(CO2), %xmm9,%xmm9

	vaddss 	        (CO2, LDC), %xmm10,%xmm10
	vaddss  1 * SIZE(CO2, LDC), %xmm11,%xmm11

#endif

	vmovss	%xmm4 ,  	(CO1)
	vmovss	%xmm5 , 1 * SIZE(CO1)

	vmovss	%xmm6 ,  	(CO1, LDC)
	vmovss	%xmm7 , 1 * SIZE(CO1, LDC)

	vmovss	%xmm8 ,  	(CO2)
	vmovss	%xmm9 , 1 * SIZE(CO2)

	vmovss	%xmm10,  	(CO2, LDC)
	vmovss	%xmm11, 1 * SIZE(CO2, LDC)

.endm


/*******************************************************************************************/

/*
 * KERNEL1x4_SUB: one k-step of the 1x4 tile update using scalar FMA.
 *
 *   Reads : one A scalar at index %rax (offset -16) and four B scalars at
 *           index BI (offsets -4 .. -1).
 *   Accum : low lane of xmm4, xmm6, xmm8, xmm10.
 *   Temps : xmm0, xmm2, xmm3.
 *   Exit  : BI += 4, %rax += 1; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL1x4_SUB
	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
	vmovss	 -2 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -1 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(  	%xmm8,%xmm2,%xmm0  )
	VFMADD231SS_(  	%xmm10,%xmm3,%xmm0 )
	addq	$ 4 , BI	
	addq	$ 1, %rax 
.endm

/*
 * SAVE1x4: write the 1x4 scalar accumulator tile to C.
 *
 * Scales the low lane of xmm4/6/8/10 by ALPHA, adds the existing C element
 * unless TRMMKERNEL is defined, then stores one scalar per column at
 * CO1, CO1+LDC, CO2, CO2+LDC.
 * All loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE1x4

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm6 , %xmm6
	vmulss	%xmm0 , %xmm8 , %xmm8
	vmulss	%xmm0 , %xmm10, %xmm10


#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4
	vaddss 	        (CO1, LDC), %xmm6,%xmm6
	vaddss 	        (CO2), %xmm8,%xmm8
	vaddss 	        (CO2, LDC), %xmm10,%xmm10

#endif

	vmovss	%xmm4 ,  	(CO1)
	vmovss	%xmm6 ,  	(CO1, LDC)
	vmovss	%xmm8 ,  	(CO2)
	vmovss	%xmm10,  	(CO2, LDC)

.endm


/*******************************************************************************************/

/*******************************************************************************************
* 2 lines of N
*******************************************************************************************/

/*
 * KERNEL16x2_SUB: one k-step of the 16x2 tile update.
 *
 *   Reads : two 8-lane vectors of A at index %rax (offsets -16 and -8),
 *           two B scalars at index BI (offsets -4 and -3), each broadcast.
 *   Accum : ymm4/ymm5 for the first B value, ymm6/ymm7 for the second.
 *   Temps : ymm0-ymm3.
 *   Exit  : BI += 2, %rax += 16; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL16x2_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm1
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(  	%ymm5,%ymm2,%ymm1 )
	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
	VFMADD231PS_(  	%ymm7,%ymm3,%ymm1 )
	addq	$ 2 , BI	
	addq	$ 16, %rax 
.endm

/*
 * SAVE16x2: write the 16x2 accumulator tile (ymm4..ymm7) to C.
 *
 * Scales every accumulator by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores two 8-lane vectors per column at
 * CO1 and CO1+LDC. CO2 is not used.
 * All loads from C precede the first store. Clobbers ymm0.
 */
.macro SAVE16x2

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm5 , %ymm5
	vmulps	%ymm0 , %ymm6 , %ymm6
	vmulps	%ymm0 , %ymm7 , %ymm7


#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %ymm4,%ymm4
	vaddps  8 * SIZE(CO1), %ymm5,%ymm5

	vaddps 	        (CO1, LDC), %ymm6,%ymm6
	vaddps  8 * SIZE(CO1, LDC), %ymm7,%ymm7

#endif

	vmovups	%ymm4 ,  	(CO1)
	vmovups	%ymm5 , 8 * SIZE(CO1)

	vmovups	%ymm6 ,  	(CO1, LDC)
	vmovups	%ymm7 , 8 * SIZE(CO1, LDC)

.endm



/*******************************************************************************************/

/*
 * KERNEL8x2_SUB: one k-step of the 8x2 tile update.
 *
 *   Reads : one 8-lane vector of A at index %rax (offset -16) and two B
 *           scalars at index BI (offsets -4 and -3), each broadcast.
 *   Accum : ymm4 (first B value), ymm6 (second B value).
 *   Temps : ymm0, ymm2, ymm3.
 *   Exit  : BI += 2, %rax += 8; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL8x2_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %ymm3
	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(  	%ymm6,%ymm3,%ymm0 )
	addq	$ 2 , BI	
	addq	$ 8 , %rax 
.endm

/*
 * SAVE8x2: write the 8x2 accumulator tile to C.
 *
 * Scales ymm4 and ymm6 by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores one 8-lane vector per column at
 * CO1 and CO1+LDC. Both loads from C precede the first store.
 * Clobbers ymm0.
 */
.macro SAVE8x2

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm6 , %ymm6

#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %ymm4,%ymm4
	vaddps 	        (CO1, LDC), %ymm6,%ymm6

#endif

	vmovups	%ymm4 ,  	(CO1)
	vmovups	%ymm6 ,  	(CO1, LDC)

.endm



/*******************************************************************************************/

/*
 * KERNEL4x2_SUB: one k-step of the 4x2 tile update (128-bit registers).
 *
 *   Reads : one 4-lane vector of A at index %rax (offset -16) and two B
 *           scalars at index BI (offsets -4 and -3), each broadcast.
 *   Accum : xmm4 (first B value), xmm6 (second B value).
 *   Temps : xmm0, xmm2, xmm3.
 *   Exit  : BI += 2, %rax += 4; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL4x2_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vbroadcastss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231PS_(  	%xmm6,%xmm3,%xmm0 )
	addq	$ 2 , BI	
	addq	$ 4 , %rax 
.endm

/*
 * SAVE4x2: write the 4x2 accumulator tile to C.
 *
 * Scales xmm4 and xmm6 by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores one 4-lane vector per column at
 * CO1 and CO1+LDC. Both loads from C precede the first store.
 * Clobbers xmm0.
 */
.macro SAVE4x2

	vbroadcastss	ALPHA, %xmm0

	vmulps	%xmm0 , %xmm4 , %xmm4
	vmulps	%xmm0 , %xmm6 , %xmm6

#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %xmm4,%xmm4
	vaddps 	        (CO1, LDC), %xmm6,%xmm6

#endif

	vmovups	%xmm4 ,  	(CO1)
	vmovups	%xmm6 ,  	(CO1, LDC)

.endm


/*******************************************************************************************/

/*
 * KERNEL2x2_SUB: one k-step of the 2x2 tile update using scalar FMA.
 *
 *   Reads : two A scalars at index %rax (offsets -16 and -15) and two B
 *           scalars at index BI (offsets -4 and -3).
 *   Accum : low lane of xmm4/xmm5 (first B value) and xmm6/xmm7 (second).
 *   Temps : xmm0-xmm3.
 *   Exit  : BI += 2, %rax += 2; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL2x2_SUB
	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss 	-15 * SIZE(AO, %rax, SIZE), %xmm1
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )
	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
	VFMADD231SS_(  	%xmm7,%xmm3,%xmm1 )
	addq	$ 2 , BI	
	addq	$ 2, %rax 
.endm

/*
 * SAVE2x2: write the 2x2 scalar accumulator tile (low lanes of
 * xmm4..xmm7) to C.
 *
 * Each accumulator is multiplied by ALPHA, the existing C element is added
 * unless TRMMKERNEL is defined, and the result is stored as a scalar at
 * offsets 0 and 1*SIZE from CO1 and CO1+LDC.
 * All loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE2x2

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm5 , %xmm5
	vmulss	%xmm0 , %xmm6 , %xmm6
	vmulss	%xmm0 , %xmm7 , %xmm7


#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4
	vaddss  1 * SIZE(CO1), %xmm5,%xmm5

	vaddss 	        (CO1, LDC), %xmm6,%xmm6
	vaddss  1 * SIZE(CO1, LDC), %xmm7,%xmm7

#endif

	vmovss	%xmm4 ,  	(CO1)
	vmovss	%xmm5 , 1 * SIZE(CO1)

	vmovss	%xmm6 ,  	(CO1, LDC)
	vmovss	%xmm7 , 1 * SIZE(CO1, LDC)

.endm


/*******************************************************************************************/

/*
 * KERNEL1x2_SUB: one k-step of the 1x2 tile update using scalar FMA.
 *
 *   Reads : one A scalar at index %rax (offset -16) and two B scalars at
 *           index BI (offsets -4 and -3).
 *   Accum : low lane of xmm4 (first B value) and xmm6 (second B value).
 *   Temps : xmm0, xmm2, xmm3.
 *   Exit  : BI += 2, %rax += 1; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL1x2_SUB
	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	vmovss	 -3 * SIZE(BO, BI, SIZE), %xmm3
	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(  	%xmm6,%xmm3,%xmm0 )
	addq	$ 2 , BI	
	addq	$ 1, %rax 
.endm

/*
 * SAVE1x2: write the 1x2 scalar accumulator tile to C.
 *
 * Scales the low lane of xmm4 and xmm6 by ALPHA, adds the existing C
 * element unless TRMMKERNEL is defined, then stores one scalar per column
 * at CO1 and CO1+LDC. Both loads from C precede the first store.
 * Clobbers xmm0.
 */
.macro SAVE1x2

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm6 , %xmm6

#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4
	vaddss 	        (CO1, LDC), %xmm6,%xmm6

#endif

	vmovss	%xmm4 ,  	(CO1)
	vmovss	%xmm6 ,  	(CO1, LDC)

.endm


/*******************************************************************************************/

/*******************************************************************************************
* 1 line of N
*******************************************************************************************/

/*
 * KERNEL16x1_SUB: one k-step of the 16x1 tile update.
 *
 *   Reads : two 8-lane vectors of A at index %rax (offsets -16 and -8) and
 *           one B scalar at index BI (offset -4), broadcast to all lanes.
 *   Accum : ymm4 (first A vector), ymm5 (second A vector).
 *   Temps : ymm0, ymm1, ymm2.
 *   Exit  : BI += 1, %rax += 16; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL16x1_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm1
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	VFMADD231PS_(  	%ymm5,%ymm2,%ymm1 )
	addq	$ 1 , BI	
	addq	$ 16, %rax 
.endm

/*
 * SAVE16x1: write the 16x1 accumulator tile (ymm4, ymm5) to C.
 *
 * Scales both accumulators by ALPHA, adds the existing C values unless
 * TRMMKERNEL is defined, then stores two 8-lane vectors at CO1 and
 * 8*SIZE(CO1). Both loads from C precede the first store.
 * Clobbers ymm0.
 */
.macro SAVE16x1

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4
	vmulps	%ymm0 , %ymm5 , %ymm5

#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %ymm4,%ymm4
	vaddps  8 * SIZE(CO1), %ymm5,%ymm5

#endif

	vmovups	%ymm4 ,  	(CO1)
	vmovups	%ymm5 , 8 * SIZE(CO1)

.endm


/*******************************************************************************************/

/*
 * KERNEL8x1_SUB: one k-step of the 8x1 tile update.
 *
 *   Reads : one 8-lane vector of A at index %rax (offset -16) and one B
 *           scalar at index BI (offset -4), broadcast to all lanes.
 *   Accum : ymm4.   Temps : ymm0, ymm2.
 *   Exit  : BI += 1, %rax += 8; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL8x1_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %ymm2
	VFMADD231PS_(  	%ymm4,%ymm2,%ymm0 )
	addq	$ 1 , BI	
	addq	$ 8 , %rax 
.endm

/*
 * SAVE8x1: write the 8x1 accumulator tile (ymm4) to C.
 *
 * Scales ymm4 by ALPHA, adds the existing C values unless TRMMKERNEL is
 * defined, then stores one 8-lane vector at CO1. Clobbers ymm0.
 */
.macro SAVE8x1

	vbroadcastss	ALPHA, %ymm0

	vmulps	%ymm0 , %ymm4 , %ymm4

#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %ymm4,%ymm4

#endif

	vmovups	%ymm4 ,  	(CO1)

.endm



/*******************************************************************************************/

/*
 * KERNEL4x1_SUB: one k-step of the 4x1 tile update (128-bit registers).
 *
 *   Reads : one 4-lane vector of A at index %rax (offset -16) and one B
 *           scalar at index BI (offset -4), broadcast to all lanes.
 *   Accum : xmm4.   Temps : xmm0, xmm2.
 *   Exit  : BI += 1, %rax += 4; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL4x1_SUB
	vmovups 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vbroadcastss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	VFMADD231PS_(  	%xmm4,%xmm2,%xmm0 )
	addq	$ 1 , BI	
	addq	$ 4 , %rax 
.endm

/*
 * SAVE4x1: write the 4x1 accumulator tile (xmm4) to C.
 *
 * Scales xmm4 by ALPHA, adds the existing C values unless TRMMKERNEL is
 * defined, then stores one 4-lane vector at CO1. Clobbers xmm0.
 */
.macro SAVE4x1

	vbroadcastss	ALPHA, %xmm0

	vmulps	%xmm0 , %xmm4 , %xmm4

#if !defined(TRMMKERNEL)

	vaddps 	        (CO1), %xmm4,%xmm4

#endif

	vmovups	%xmm4 ,  	(CO1)

.endm


/*******************************************************************************************/

/*
 * KERNEL2x1_SUB: one k-step of the 2x1 tile update using scalar FMA.
 *
 *   Reads : two A scalars at index %rax (offsets -16 and -15) and one B
 *           scalar at index BI (offset -4).
 *   Accum : low lane of xmm4 (first A scalar) and xmm5 (second A scalar).
 *   Temps : xmm0, xmm1, xmm2.
 *   Exit  : BI += 1, %rax += 2; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL2x1_SUB
	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss 	-15 * SIZE(AO, %rax, SIZE), %xmm1
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	VFMADD231SS_(  	%xmm5,%xmm2,%xmm1 )
	addq	$ 1 , BI	
	addq	$ 2 , %rax 
.endm

/*
 * SAVE2x1: write the 2x1 scalar accumulator tile (low lanes of xmm4 and
 * xmm5) to C.
 *
 * Scales both accumulators by ALPHA, adds the existing C elements unless
 * TRMMKERNEL is defined, then stores two scalars at CO1 and 1*SIZE(CO1).
 * Both loads from C precede the first store. Clobbers xmm0.
 */
.macro SAVE2x1

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4
	vmulss	%xmm0 , %xmm5 , %xmm5

#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4
	vaddss  1 * SIZE(CO1), %xmm5,%xmm5

#endif

	vmovss	%xmm4 ,  	(CO1)
	vmovss	%xmm5 , 1 * SIZE(CO1)

.endm


/*******************************************************************************************/

/*
 * KERNEL1x1_SUB: one k-step of the 1x1 update using scalar FMA.
 *
 *   Reads : one A scalar at index %rax (offset -16) and one B scalar at
 *           index BI (offset -4).
 *   Accum : low lane of xmm4.   Temps : xmm0, xmm2.
 *   Exit  : BI += 1, %rax += 1; AO and BO are unchanged (flags from the
 *           final addq are live).
 */
.macro KERNEL1x1_SUB
	vmovss 	-16 * SIZE(AO, %rax, SIZE), %xmm0
	vmovss	 -4 * SIZE(BO, BI, SIZE), %xmm2
	VFMADD231SS_(  	%xmm4,%xmm2,%xmm0 )
	addq	$ 1 , BI	
	addq	$ 1 , %rax 
.endm

/*
 * SAVE1x1: write the single scalar accumulator (low lane of xmm4) to C.
 *
 * Scales it by ALPHA, adds the existing C element unless TRMMKERNEL is
 * defined, then stores one scalar at CO1. Clobbers xmm0.
 */
.macro SAVE1x1

	vmovss	ALPHA, %xmm0

	vmulss	%xmm0 , %xmm4 , %xmm4

#if !defined(TRMMKERNEL)

	vaddss 	        (CO1), %xmm4,%xmm4

#endif

	vmovss	%xmm4 ,  	(CO1)

.endm


/*******************************************************************************************/

#if !defined(TRMMKERNEL)

/*************************************************************************************
* GEMM Kernel
*************************************************************************************/


	PROLOGUE
	PROFCODE
	
	subq	$STACKSIZE, %rsp
	movq	%rbx,   (%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	vzeroupper

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      OLD_K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	vmovsd	OLD_OFFSET, %xmm12
#endif
	vmovaps	%xmm3, %xmm0

#else
	movq	STACKSIZE +  8(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	STACKSIZE + 16(%rsp), %xmm12
#endif

#endif

	movq    %rsp, SP      # save old stack
        subq    $128 + L_BUFFER_SIZE, %rsp
        andq    $-4096, %rsp    # align stack

        STACK_TOUCH

	cmpq	$0, OLD_M
	je	.L999

	cmpq	$0, OLD_N
	je	.L999

	cmpq	$0, OLD_K
	je	.L999

	movq	OLD_M, M
	movq	OLD_N, N
	movq	OLD_K, K

	vmovss	 %xmm0, ALPHA

	salq	$BASE_SHIFT, LDC

	// Split N into groups of 12 columns plus a remainder.  Each pass of the
	// J loop handles 12 columns as two 6-column halves (.L6_* then .L7_*).
	// NOTE: despite their names, Ndiv6 and Nmod6 hold N / 12 and N % 12.
	movq    N, %rax
        xorq    %rdx, %rdx              // clear the high half of the rdx:rax dividend
        movq    $12,  %rdi              // divisor; rdi is used as scratch here
        divq    %rdi                    // rax = N / 12, rdx = N % 12
        movq    %rax, Ndiv6             // Ndiv6 = N / 12 (number of 12-column groups)
        movq    %rdx, Nmod6             // Nmod6 = N % 12 (columns left for the code at .L4_00)

	movq	Ndiv6,  J
	cmpq	$0, J
	je	.L4_00                  // fewer than 12 columns: skip the 6-wide code
	ALIGN_4


/*******************************************************************************************/

.L6_01:
	// Pack the B data for the first 6-column half into BUFFER1.
	// Each of the K iterations of .L6_02c writes 6*SIZE bytes to BO:
	// 16 bytes read through BO1 followed by 8 bytes read through BO2.
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	salq	$2, %rax		// rax = 4 * K (4 values of B per k)
        leaq    (B, %rax,4), BO2        // BO2 = B + 16 * K bytes (the following 4-value panel if SIZE is 4 -- TODO confirm)
        movq    BO2, B                  // next offset of B
        movq    K, %rax                 // loop counter: K iterations

	ALIGN_4


.L6_02c:

	vmovups	(BO1), %xmm0		// 16 bytes from the panel at BO1
	vmovsd	(BO2), %xmm1		// first 8 bytes from the panel at BO2
	vmovups	%xmm0, (BO)
	vmovsd	%xmm1, 4*SIZE(BO)	// placed at offset 4*SIZE of the packed row
	addq	$ 4*SIZE,BO1
	addq	$ 4*SIZE,BO2
	addq	$ 6*SIZE,BO
	decq	%rax
	jnz	.L6_02c


.L6_10:
	movq	 C, CO1
	leaq	(C,   LDC, 2), CO2	
	leaq	(CO2, LDC, 1), CO2		// co2 = c + 3 * ldc
	leaq	(C,   LDC, 4), C	
	leaq	(C,   LDC, 2), C		// c = c + 6 * ldc

	movq	A, AO		 	// aoffset = a
	addq	$ 16 * SIZE, AO

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L6_20

	ALIGN_4

.L6_11:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L6_16

	ALIGN_4

.L6_12:

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	je	.L6_16

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	je	.L6_16

	jmp	.L6_12
	ALIGN_4

.L6_16:
	// Handle the k-steps that the 8x-unrolled loop above did not cover.
        movq    K, %rax

	andq	$7, %rax		# rax = K % 8 (remaining k-steps)
	je .L6_19			// nothing left: go straight to the store

	ALIGN_4

.L6_17:

	KERNEL16x6_SUB

	// The branch uses the flags left by KERNEL16x6_SUB (defined outside this
	// chunk; presumably it decrements %rax -- TODO confirm).
	jnz	.L6_17
	ALIGN_4


.L6_19:

	SAVE16x6

	addq	$16 * SIZE, CO1		# coffset += 16
	addq	$16 * SIZE, CO2		# coffset += 16
	decq	I			# i --
	jg	.L6_11
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
.L6_20:
	// Test rest of M

	testq	$15, M
	jz	.L6_60		// to next 6 lines of N

	testq	$8, M		
	jz	.L6_21pre
	ALIGN_4

/**************************************************************************/

.L6_20_1:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L6_20_6

	ALIGN_4

.L6_20_2:

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	je	.L6_20_6

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	je	.L6_20_6

	jmp	.L6_20_2
	ALIGN_4

.L6_20_6:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L6_20_9

	ALIGN_4

.L6_20_7:

	KERNEL8x6_SUB

	jnz	.L6_20_7
	ALIGN_4


.L6_20_9:

	SAVE8x6

	addq	$8 * SIZE, CO1		# coffset += 8
	addq	$8 * SIZE, CO2		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

.L6_21pre:

	testq	$4, M		
	jz	.L6_30
	ALIGN_4

.L6_21:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L6_26

	ALIGN_4

.L6_22:

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	je	.L6_26

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	je	.L6_26

	jmp	.L6_22
	ALIGN_4

.L6_26:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L6_29

	ALIGN_4

.L6_27:

	KERNEL4x6_SUB

	jnz	.L6_27
	ALIGN_4


.L6_29:

	SAVE4x6

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4
	

.L6_30:
	testq	$2, M		
	jz	.L6_40

	ALIGN_4

.L6_31:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L6_36

	ALIGN_4

.L6_32:

	prefetcht0	A_PR1(AO)
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	je	.L6_36

	prefetcht0	A_PR1(AO)
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	je	.L6_36

	jmp	.L6_32
	ALIGN_4

.L6_36:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L6_39

	ALIGN_4

.L6_37:

	KERNEL2x6_SUB

	jnz	.L6_37
	ALIGN_4


.L6_39:

	SAVE2x6

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4

.L6_40:
	testq	$1, M		
	jz	.L6_60		// no odd row of M left: continue with the second 6-column half (.L7_01)

	ALIGN_4

.L6_41:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L6_46

	ALIGN_4

.L6_42:

	prefetcht0	A_PR1(AO)
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	je	.L6_46

	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	je	.L6_46

	jmp	.L6_42
	ALIGN_4

.L6_46:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L6_49

	ALIGN_4

.L6_47:

	KERNEL1x6_SUB

	jnz	.L6_47
	ALIGN_4


.L6_49:

	SAVE1x6

	addq	$1 * SIZE, CO1		# coffset += 1
	addq	$1 * SIZE, CO2		# coffset += 1
	ALIGN_4
	



	
.L6_60:


/*******************************************************************************************/


.L7_01:
	// Pack the B data for the second 6-column half into BUFFER1.
	// B was already moved forward by .L6_01, so BO1 starts at the panel
	// .L6_02c read its last 8 bytes per k from.  Each of the K iterations
	// writes 6*SIZE bytes to BO: 8 bytes from offset 2*SIZE of BO1, then
	// 16 bytes from BO2.
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	salq	$2, %rax		// rax = 4 * K (4 values of B per k)
        leaq    (B, %rax,4), BO2        // BO2 = B + 16 * K bytes (the following 4-value panel if SIZE is 4 -- TODO confirm)
        movq    K, %rax                 // loop counter: K iterations

	ALIGN_4


.L7_02c:

	vmovsd	2*SIZE(BO1), %xmm0	// 8 bytes at offset 2*SIZE of the panel at BO1
	vmovups	      (BO2), %xmm1	// 16 bytes from the panel at BO2
	vmovsd	%xmm0, (BO)
	vmovups	%xmm1, 2*SIZE(BO)	// placed at offset 2*SIZE of the packed row
	addq	$ 4*SIZE,BO1
	addq	$ 4*SIZE,BO2
	addq	$ 6*SIZE,BO
	decq	%rax
	jnz	.L7_02c

        movq    BO2, B                  // next offset of B (BO2 has walked past its panel)

.L7_10:
	movq	 C, CO1
	leaq	(C,   LDC, 2), CO2	
	leaq	(CO2, LDC, 1), CO2		// co2 = c + 3 * ldc
	leaq	(C,   LDC, 4), C	
	leaq	(C,   LDC, 2), C		// c = c + 6 * ldc

	movq	A, AO		 	// aoffset = a
	addq	$ 16 * SIZE, AO

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L7_20

	ALIGN_4

.L7_11:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L7_16

	ALIGN_4

.L7_12:

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	je	.L7_16

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB
	KERNEL16x6_SUB

	je	.L7_16

	jmp	.L7_12
	ALIGN_4

.L7_16:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L7_19

	ALIGN_4

.L7_17:

	KERNEL16x6_SUB

	jnz	.L7_17
	ALIGN_4


.L7_19:

	SAVE16x6

	addq	$16 * SIZE, CO1		# coffset += 16
	addq	$16 * SIZE, CO2		# coffset += 16
	decq	I			# i --
	jg	.L7_11
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
.L7_20:
	// Test rest of M

	testq	$15, M
	jz	.L7_60		// to next 6 lines of N

	testq	$8, M		
	jz	.L7_21pre
	ALIGN_4

/**************************************************************************/

.L7_20_1:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L7_20_6

	ALIGN_4

.L7_20_2:

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	je	.L7_20_6

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB
	prefetcht0	A_PR1(AO)
	KERNEL8x6_SUB
	KERNEL8x6_SUB

	je	.L7_20_6

	jmp	.L7_20_2
	ALIGN_4

.L7_20_6:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L7_20_9

	ALIGN_4

.L7_20_7:

	KERNEL8x6_SUB

	jnz	.L7_20_7
	ALIGN_4


.L7_20_9:

	SAVE8x6

	addq	$8 * SIZE, CO1		# coffset += 8
	addq	$8 * SIZE, CO2		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

.L7_21pre:

	testq	$4, M		
	jz	.L7_30
	ALIGN_4

.L7_21:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L7_26

	ALIGN_4

.L7_22:

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	je	.L7_26

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	prefetcht0	A_PR1(AO)
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB
	KERNEL4x6_SUB

	je	.L7_26

	jmp	.L7_22
	ALIGN_4

.L7_26:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L7_29

	ALIGN_4

.L7_27:

	KERNEL4x6_SUB

	jnz	.L7_27
	ALIGN_4


.L7_29:

	SAVE4x6

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4
	

.L7_30:
	testq	$2, M		
	jz	.L7_40

	ALIGN_4

.L7_31:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L7_36

	ALIGN_4

.L7_32:

	prefetcht0	A_PR1(AO)
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	je	.L7_36

	prefetcht0	A_PR1(AO)
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB
	KERNEL2x6_SUB

	je	.L7_36

	jmp	.L7_32
	ALIGN_4

.L7_36:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L7_39

	ALIGN_4

.L7_37:

	KERNEL2x6_SUB

	jnz	.L7_37
	ALIGN_4


.L7_39:

	SAVE2x6

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4

.L7_40:
	testq	$1, M		
	jz	.L7_60		// no odd row of M left: go to the J loop test (next 12 lines of N)

	ALIGN_4

.L7_41:
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO

	vzeroall

        movq    K, %rax

	andq	$-8, %rax
	je	.L7_46

	ALIGN_4

.L7_42:

	prefetcht0	A_PR1(AO)
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	je	.L7_46

	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB
	KERNEL1x6_SUB

	je	.L7_46

	jmp	.L7_42
	ALIGN_4

.L7_46:
        movq    K, %rax

	andq	$7, %rax		# if (k & 1)
	je .L7_49

	ALIGN_4

.L7_47:

	KERNEL1x6_SUB

	jnz	.L7_47
	ALIGN_4


.L7_49:

	SAVE1x6

	addq	$1 * SIZE, CO1		# coffset += 1
	addq	$1 * SIZE, CO2		# coffset += 1
	ALIGN_4
	



	
.L7_60:

	decq	J			// j --
	jg	.L6_01			// next 12 lines of N




/*******************************************************************************************/
.L4_00:

	// Nmod6 holds N % 12 (see the divq by 12 above), so J becomes the
	// number of full 4-column panels in the remainder.
 	movq    Nmod6,  J
        sarq    $2, J           // J = (N % 12) / 4
        cmpq    $ 0, J
        je      .L2_00          // no 4-column panel: go to the 2/1-column code
        ALIGN_4


.L4_01:
	// Copy the current 4-column panel of B into BUFFER1: K copies of
	// 4*SIZE bytes in total.  The main loop moves 16*SIZE bytes (four
	// k-steps) per iteration, the tail loop one k-step per iteration.
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	sarq	$2, %rax		// K / 4
	jz	.L4_01b			// K < 4: only the tail loop runs
	ALIGN_4


.L4_01a:
        prefetcht0 512(BO1)
        prefetchw  512(BO)

	vmovups	       (BO1), %xmm0
	vmovups	 4*SIZE(BO1), %xmm1
	vmovups	 8*SIZE(BO1), %xmm2
	vmovups	12*SIZE(BO1), %xmm3

	vmovups	%xmm0,       (BO)
	vmovups	%xmm1, 4*SIZE(BO)
	vmovups	%xmm2, 8*SIZE(BO)
	vmovups	%xmm3,12*SIZE(BO)

	addq	$ 16*SIZE,BO1
	addq	$ 16*SIZE,BO
	decq	%rax
	jnz	.L4_01a


.L4_01b:

        movq    K, %rax
        andq    $3, %rax                // K % 4
        jz      .L4_02d                 // K is a multiple of 4: copy is complete
        ALIGN_4

.L4_02c:

	vmovups	(BO1), %xmm0		// one k-step per iteration
	vmovups	%xmm0, (BO)
	addq	$ 4*SIZE,BO1
	addq	$ 4*SIZE,BO
	decq	%rax
	jnz	.L4_02c

.L4_02d:

	movq	BO1, B			// next offset of B

.L4_10:
	movq	 C, CO1
	leaq	(C, LDC, 2), CO2	
	leaq	(C, LDC, 4), C		// c += 4 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif
	
	movq	A, AO		 	// aoffset = a
	addq	$ 16 * SIZE, AO

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L4_20

	ALIGN_4

.L4_11:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             	// first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $16, %rax	// number of values in AO
#else
        addq    $4, %rax	// number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L4_16
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4) , BI                   	//  BI = BI * 4 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_12:

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	je	.L4_16

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	je	.L4_16

	jmp	.L4_12
	ALIGN_4

.L4_16:
	// Handle the k-steps that the 8x-unrolled loop above did not cover.
	// NOTE(review): this code sits inside the `#if !defined(TRMMKERNEL)`
	// opened before PROLOGUE, so the TRMMKERNEL branch below looks
	// unreachable here -- confirm.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = count % 8 (remaining k-steps)
	je .L4_19			// nothing left: go straight to the store

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	// Move AO/BO past the remaining data and negate both indices, so the
	// kernel addresses it with negative indices that count up towards zero.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_17:

	KERNEL16x4_SUB

	// The branch uses the flags left by KERNEL16x4_SUB (defined outside this
	// chunk; presumably its final add to %rax -- TODO confirm): loop while
	// the result is still negative.
	jl	.L4_17
	ALIGN_4


.L4_19:

	SAVE16x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $16, KK				
#endif

	addq	$16 * SIZE, CO1		# coffset += 16
	addq	$16 * SIZE, CO2		# coffset += 16
	decq	I			# i --
	jg	.L4_11
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
.L4_20:
	// Test rest of M

	testq	$15, M
	jz	.L4_60		// to next 3 lines of N

	testq	$8, M		
	jz	.L4_21pre
	ALIGN_4

/**************************************************************************/

.L4_20_1:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax        // number of values in A
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L4_20_6
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_20_2:

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	je	.L4_20_6

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	je	.L4_20_6

	jmp	.L4_20_2
	ALIGN_4

.L4_20_6:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 1)
	je .L4_20_9

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_20_7:

	KERNEL8x4_SUB

	jl	.L4_20_7
	ALIGN_4


.L4_20_9:

	SAVE8x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	addq	$8 * SIZE, CO2		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

.L4_21pre:

	testq	$4, M		
	jz	.L4_30
	ALIGN_4

.L4_21:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax        // number of values in A
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L4_26
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_22:

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	je	.L4_26

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	je	.L4_26

	jmp	.L4_22
	ALIGN_4

.L4_26:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 1)
	je .L4_29

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_27:

	KERNEL4x4_SUB

	jl	.L4_27
	ALIGN_4


.L4_29:

	SAVE4x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4
	

.L4_30:
	testq	$2, M		
	jz	.L4_40

	ALIGN_4

.L4_31:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L4_36
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_32:

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	je	.L4_36

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	je	.L4_36

	jmp	.L4_32
	ALIGN_4

.L4_36:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 1)
	je .L4_39

	movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
	
	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_37:

	KERNEL2x4_SUB

	jl	.L4_37
	ALIGN_4


.L4_39:

	SAVE2x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4

.L4_40:
	testq	$1, M		
	jz	.L4_60		// to next 4 lines of N

	ALIGN_4

.L4_41:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax
	je	.L4_46
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_42:

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	je	.L4_46

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	je	.L4_46

	jmp	.L4_42
	ALIGN_4

.L4_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 1)
	je .L4_49

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_47:

	KERNEL1x4_SUB

	jl	.L4_47
	ALIGN_4


.L4_49:

	SAVE1x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

	addq	$1 * SIZE, CO1		# coffset += 1
	addq	$1 * SIZE, CO2		# coffset += 1
	ALIGN_4
	



	
.L4_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq    $4, KK
#endif

	decq	J			// j --
	jg	.L4_01			// next 4 lines of N



/*******************************************************************************************/
.L2_00:

	// Nmod6 holds N % 12 (see the divq by 12 above); its low two bits are
	// the columns still pending after the 4-column panels.
	movq	Nmod6, J		
	andq	$3, J			// J = (N % 12) % 4
	je	.L999			// no column left: jump to .L999

	movq	Nmod6, J		
	andq	$2, J			// bit 1 set: a 2-column panel remains
	je	.L1_0			// bit 1 clear: only a single column remains

.L2_01:

	// copy to sub buffer
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	sarq	$2, %rax		// K / 4
	jz	.L2_01b
	ALIGN_4

.L2_01a:

	vmovsd	      (BO1), %xmm0
	vmovsd	2*SIZE(BO1), %xmm1
	vmovsd	4*SIZE(BO1), %xmm2
	vmovsd	6*SIZE(BO1), %xmm3

	vmovsd	%xmm0,       (BO)
	vmovsd	%xmm1, 2*SIZE(BO)
	vmovsd	%xmm2, 4*SIZE(BO)
	vmovsd	%xmm3, 6*SIZE(BO)

	addq	$8*SIZE,BO1
	addq	$8*SIZE,BO
	decq	%rax
	jnz	.L2_01a


.L2_01b:

        movq    K, %rax
        andq    $3, %rax                // K % 4
        jz      .L2_02d
        ALIGN_4

.L2_02c:

	vmovsd 	(BO1), %xmm0
	vmovsd 	%xmm0, (BO)
	addq	$2*SIZE,BO1
	addq	$2*SIZE,BO
	decq	%rax
	jnz	.L2_02c

.L2_02d:

	movq	BO1, B			// next offset of B

.L2_10:
	movq	C, CO1
	leaq	(C, LDC, 2), C		// c += 2 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif
	
	movq	A, AO		 	// aoffset = a
	addq	$16 * SIZE, AO

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L2_20

	ALIGN_4

.L2_11:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $16, %rax	// number of values in AO
#else
        addq    $2, %rax	// number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L2_16
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_12:

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	je	.L2_16

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	je	.L2_16

	jmp	.L2_12
	ALIGN_4

.L2_16:
	// 16x2 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L2_19			// none left over: go to SAVE16x2

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L2_17 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL16x2_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_17:

	KERNEL16x2_SUB

	jl	.L2_17
	ALIGN_4


.L2_19:

	SAVE16x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $16, KK				
#endif

	addq	$16 * SIZE, CO1		# coffset += 16
	decq	I			# i --
	jg	.L2_11
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
.L2_20:
	// Test rest of M

	testq	$15, M
	jz	.L2_60		// to next 2 lines of N

	testq	$8, M		
	jz	.L2_21pre
	ALIGN_4

/**************************************************************************/

.L2_20_1:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax        // number of values in A
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L2_20_6
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_20_2:


	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	je	.L2_20_6

	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	je	.L2_20_6

	jmp	.L2_20_2
	ALIGN_4

.L2_20_6:
	// 8x2 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L2_20_9			// none left over: go to SAVE8x2

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L2_20_7 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL8x2_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_20_7:

	KERNEL8x2_SUB

	jl	.L2_20_7
	ALIGN_4


.L2_20_9:

	SAVE8x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

.L2_21pre:

	testq	$4, M		
	jz	.L2_30
	ALIGN_4

.L2_21:
	// 4x2 tile (reached when bit 2 of M is set): position BO and AO, zero
	// the vector registers and set up the 8x-unrolled k loop.
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // TRMM variants that start at offset KK: skip KK k steps in both
        // the packed B buffer (2 values each) and A (4 values each).
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k iterations for this tile; the TRMM variants also
	// record it in KKK for the remainder code at .L2_26.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax        // number of values in A
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			// round the trip count down to a multiple of 8
	je	.L2_26				// fewer than 8 iterations: remainder only
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	// Point AO/BO just past the unrolled part and negate both indices.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_22:


	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	je	.L2_26

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	je	.L2_26

	jmp	.L2_22
	ALIGN_4

.L2_26:
	// 4x2 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L2_29			// none left over: go to SAVE4x2

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L2_27 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL4x2_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_27:

	KERNEL4x2_SUB

	jl	.L2_27
	ALIGN_4


.L2_29:

	SAVE4x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	ALIGN_4
	

.L2_30:
	testq	$2, M		
	jz	.L2_40

	ALIGN_4

.L2_31:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L2_36
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_32:

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	je	.L2_36

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	je	.L2_36

	jmp	.L2_32
	ALIGN_4

.L2_36:
	// 2x2 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L2_39			// none left over: go to SAVE2x2

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
	
	salq	$1, %rax			// rax = rax *2 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L2_37 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL2x2_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_37:

	KERNEL2x2_SUB

	jl	.L2_37
	ALIGN_4


.L2_39:

	SAVE2x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	ALIGN_4

.L2_40:
	testq	$1, M		
	jz	.L2_60		// to next 2 lines of N

	ALIGN_4

.L2_41:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax
	je	.L2_46
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_42:

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	je	.L2_46

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	je	.L2_46

	jmp	.L2_42
	ALIGN_4

.L2_46:
	// 1x2 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L2_49			// none left over: go to SAVE1x2

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	// One value of A per k step, so rax needs no scaling here.
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L2_47 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL1x2_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_47:

	KERNEL1x2_SUB

	jl	.L2_47
	ALIGN_4


.L2_49:

	SAVE1x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

	addq	$1 * SIZE, CO1		# coffset += 1
	ALIGN_4
	



	
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq    $2, KK
#endif




.L1_0:

/************************************************************************************************
* Final single column of N: processed only when Nmod6 is odd (Nmod6 & 1)
*************************************************************************************************/

	movq	Nmod6, J		
	andq	$1, J			// j % 2
	je	.L999			// no odd column left: leave the kernel
	ALIGN_4

.L1_01:
	// Copy K values of B (one value of SIZE bytes per k step) into BUFFER1.
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	ALIGN_4

.L1_02b:
	// NOTE(review): this loop is do-while shaped and relies on K being
	// nonzero. The TRMM prologue visible in this file exits early when K
	// is 0; confirm the same guard exists for every entry point that
	// reaches this label.

	vmovss	(BO1), %xmm0
	vmovss	%xmm0,       (BO)
	addq	$1*SIZE,BO1
	addq	$1*SIZE,BO
	decq	%rax
	jnz	.L1_02b

.L1_02c:

	movq	BO1, B			// next offset of B

.L1_10:
	movq	C, CO1
	leaq	(C, LDC, 1), C		// c += 1 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif
	
	movq	A, AO		 	// aoffset = a
	addq	$16 * SIZE, AO

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L1_20

	ALIGN_4

.L1_11:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $16, %rax	// number of values in AO
#else
        addq    $1, %rax	// number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L1_16
	movq    %rax, BI                        //  Index for BO

	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_12:

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	je	.L1_16

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	je	.L1_16

	jmp	.L1_12
	ALIGN_4

.L1_16:
	// 16x1 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L1_19			// none left over: go to SAVE16x1

	movq    %rax, BI                        //  Index for BO

	salq	$4, %rax			// rax = rax * 16 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L1_17 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL16x1_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_17:

	KERNEL16x1_SUB

	jl	.L1_17
	ALIGN_4


.L1_19:

	SAVE16x1

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $16, KK				
#endif

	addq	$16 * SIZE, CO1		# coffset += 16
	decq	I			# i --
	jg	.L1_11
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
.L1_20:
	// Test rest of M

	testq	$15, M
	jz	.L999

	testq	$8, M		
	jz	.L1_21pre
	ALIGN_4

/**************************************************************************/

.L1_20_1:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax        // number of values in A
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L1_20_6
	movq    %rax, BI                        //  Index for BO

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_20_2:

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	je	.L1_20_6

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	je	.L1_20_6

	jmp	.L1_20_2
	ALIGN_4

.L1_20_6:
	// 8x1 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L1_20_9			// none left over: go to SAVE8x1

	movq    %rax, BI                        //  Index for BO

	salq	$3, %rax			// rax = rax * 8 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L1_20_7 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL8x1_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_20_7:

	KERNEL8x1_SUB

	jl	.L1_20_7
	ALIGN_4


.L1_20_9:

	SAVE8x1

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

.L1_21pre:

	testq	$4, M		
	jz	.L1_30
	ALIGN_4

.L1_21:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax        // number of values in A
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L1_26
	movq    %rax, BI                        //  Index for BO

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_22:

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	je	.L1_26

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	je	.L1_26

	jmp	.L1_22
	ALIGN_4

.L1_26:
	// 4x1 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L1_29			// none left over: go to SAVE4x1

	movq    %rax, BI                        //  Index for BO

	salq	$2, %rax			// rax = rax * 4 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L1_27 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL4x1_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_27:

	KERNEL4x1_SUB

	jl	.L1_27
	ALIGN_4


.L1_29:

	SAVE4x1

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	ALIGN_4
	

.L1_30:
	testq	$2, M		
	jz	.L1_40

	ALIGN_4

.L1_31:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax
	je	.L1_36
	movq    %rax, BI                        //  Index for BO

	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_32:

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	je	.L1_36

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	je	.L1_36

	jmp	.L1_32
	ALIGN_4

.L1_36:
	// 2x1 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L1_39			// none left over: go to SAVE2x1

	movq    %rax, BI                        //  Index for BO
	
	salq	$1, %rax			// rax = rax *2 ; number of values
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L1_37 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL2x1_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_37:

	KERNEL2x1_SUB

	jl	.L1_37
	ALIGN_4


.L1_39:

	SAVE2x1

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	ALIGN_4

.L1_40:
	testq	$1, M		
	jz	.L999

	ALIGN_4

.L1_41:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax
	je	.L1_46
	movq    %rax, BI                        //  Index for BO

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_42:

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	je	.L1_46

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	je	.L1_46

	jmp	.L1_42
	ALIGN_4

.L1_46:
	// 1x1 tile, remainder setup: run the k iterations left over after the
	// 8x-unrolled loop. The trip count is K for GEMM and KKK for TRMM.
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = trip count % 8 ; leftover iterations
	je .L1_49			// none left over: go to SAVE1x1

	movq    %rax, BI                        //  Index for BO

	// One value of A and one of B per k step, so neither index is scaled.
	// Point AO/BO just past the remainder and negate both indices; the
	// loop at .L1_47 repeats while its jl condition holds.
	// NOTE(review): the index stepping and the flags tested by jl come from
	// KERNEL1x1_SUB, which is not visible here; confirm.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_47:

	KERNEL1x1_SUB

	jl	.L1_47
	ALIGN_4


.L1_49:

	SAVE1x1

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

	addq	$1 * SIZE, CO1		# coffset += 1
	ALIGN_4
	

.L999:
	// Common exit path: clear the AVX upper state, undo the stack
	// realignment, restore the saved registers and return.

	// The tile code above works on the full vector registers (vzeroall plus
	// the KERNEL/SAVE macros). Clearing the upper ymm halves here keeps the
	// legacy-SSE movups restores below, and any SSE code in the caller,
	// from paying an AVX/SSE transition penalty. Only the upper 128 bits
	// are touched, so the xmm values restored below are unaffected.
	vzeroupper

	movq   		SP, %rsp	// back to the stack pointer saved in SP
	movq	   (%rsp), %rbx		// restore callee-saved integer registers
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	// Windows x64 additionally treats rdi, rsi and xmm6-xmm15 as
	// callee-saved; reload them from their save slots.
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp	// release the register save area
	ret

	EPILOGUE



#else

/*************************************************************************************
* TRMM Kernel
*************************************************************************************/


	// Entry sequence of the TRMM kernel: save registers, fetch the
	// arguments, carve out an aligned scratch area on the stack, reject
	// empty problems and split N into groups of four columns.
	PROLOGUE
	PROFCODE
	
	subq	$STACKSIZE, %rsp	// register save area
	movq	%rbx,   (%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	vzeroupper

#ifdef WINDOWS_ABI
	// Windows x64: rdi, rsi and xmm6-xmm15 are saved as well, then the
	// arguments are moved into the registers and names used by the rest
	// of the kernel.
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      OLD_K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	vmovsd	OLD_OFFSET, %xmm12
#endif
	vmovaps	%xmm3, %xmm0		// xmm0 is what gets stored to ALPHA below

#else
	movq	STACKSIZE +  8(%rsp), LDC
#ifdef TRMMKERNEL
	// NOTE(review): legacy movsd here versus vmovsd in the Windows branch;
	// both load the same 64 bits, consider unifying on the VEX form.
	movsd	STACKSIZE + 16(%rsp), %xmm12
#endif

#endif

	movq    %rsp, SP      # save old stack
        subq    $128 + L_BUFFER_SIZE, %rsp	// room for the scratch buffer
        andq    $-4096, %rsp    # align stack

        STACK_TOUCH

	// Nothing to do when any of M, N or K is zero.
	cmpq	$0, OLD_M
	je	.L999

	cmpq	$0, OLD_N
	je	.L999

	cmpq	$0, OLD_K
	je	.L999

	movq	OLD_M, M
	movq	OLD_N, N
	movq	OLD_K, K

	vmovss	 %xmm0, ALPHA

	salq	$BASE_SHIFT, LDC	// LDC <<= BASE_SHIFT (presumably elements to bytes)

	// Unsigned divide of N by 4. Despite their names, Ndiv6 and Nmod6
	// hold N / 4 and N % 4 in this kernel. rdx is zeroed first because
	// divq divides rdx:rax.
	movq    N, %rax
        xorq    %rdx, %rdx
        movq    $4,  %rdi
        divq    %rdi                    //    N / 4
        movq    %rax, Ndiv6             //    N / 4
        movq    %rdx, Nmod6             //    N % 4

	

#ifdef TRMMKERNEL
	// Store the offset argument to OFFSET and KK; KK is negated when
	// LEFT is not defined.
	vmovsd	%xmm12, OFFSET
	vmovsd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif	
#endif

	movq	Ndiv6,  J
	cmpq	$0, J
	je	.L2_0			// no full group of four columns
	ALIGN_4

/*******************************************************************************************/

.L4_01:
	// Copy the next K steps of B (starting at B) into BUFFER1.
	// The pointers advance by 4*SIZE per k step, i.e. four values per k;
	// each vmovups moves 128 bits (four values when SIZE is 4 bytes, as
	// for single precision).
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	sarq	$2, %rax		// K / 4
	jz	.L4_01b			// fewer than 4 k steps: tail loop only
	ALIGN_4


.L4_01a:
	// Main copy loop: four k steps (16*SIZE bytes) per iteration, with
	// prefetches 512 bytes ahead of both the source and the destination.
        prefetcht0 512(BO1)
        prefetchw  512(BO)

	vmovups	       (BO1), %xmm0
	vmovups	 4*SIZE(BO1), %xmm1
	vmovups	 8*SIZE(BO1), %xmm2
	vmovups	12*SIZE(BO1), %xmm3

	vmovups	%xmm0,       (BO)
	vmovups	%xmm1, 4*SIZE(BO)
	vmovups	%xmm2, 8*SIZE(BO)
	vmovups	%xmm3,12*SIZE(BO)

	addq	$ 16*SIZE,BO1
	addq	$ 16*SIZE,BO
	decq	%rax
	jnz	.L4_01a


.L4_01b:
	// Tail copy loop: the remaining K % 4 steps, one k step per iteration.

        movq    K, %rax
        andq    $3, %rax                // K % 4
        jz      .L4_02d
        ALIGN_4

.L4_02c:

	vmovups	(BO1), %xmm0
	vmovups	%xmm0, (BO)
	addq	$ 4*SIZE,BO1
	addq	$ 4*SIZE,BO
	decq	%rax
	jnz	.L4_02c

.L4_02d:

	movq	BO1, B			// next offset of B

// Per-panel setup for the 4-column case: CO1 = C, CO2 = C + 2 * LDC, then C
// moves on by 4 * LDC.  AO restarts at A plus a 16 * SIZE bias
// (NOTE(review): presumably matched by negative displacements inside the
// KERNEL macros, which are outside this view -- confirm).
.L4_10:
	movq	 C, CO1
	leaq	(C, LDC, 2), CO2	
	leaq	(C, LDC, 4), C		// c += 4 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        // KK restarts from OFFSET for every panel when LEFT is defined
        movq    OFFSET, %rax
        movq    %rax, KK
#endif
	
	movq	A, AO		 	// aoffset = a
	addq	$ 16 * SIZE, AO

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L4_20			// no full 16-row tile

	ALIGN_4

// 16x4 tile loop: one pass per group of 16 rows of M (I counts the groups
// down) against the buffered 4-column panel.  The k loop is split into a part
// that is a multiple of 8 steps (.L4_12) and a tail of k & 7 steps (.L4_17).
// KERNEL16x4_SUB and SAVE16x4 are macros defined outside this view.
.L4_11:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             	// first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $16, %rax	// number of values in AO
#else
        addq    $4, %rax	// number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L4_16
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4) , BI                   	//  BI = BI * 4 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	// AO and BO are moved past the data of the unrolled part and the two
	// indices are negated, so (AO, rax) and (BO, BI) start at the beginning.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_12:

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	// NOTE(review): this je consumes flags left by the last KERNEL16x4_SUB
	// (prefetcht0 does not modify flags); presumably set when the negative
	// index reaches zero -- confirm against the macro.
	je	.L4_16

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	prefetcht0	A_PR1(AO, %rax, SIZE)
	prefetcht0	B_PR1(BO, BI  , SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB
	prefetcht0	A_PR1(AO, %rax, SIZE)
	KERNEL16x4_SUB

	je	.L4_16

	jmp	.L4_12
	ALIGN_4

.L4_16:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L4_19

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_17:

	KERNEL16x4_SUB

	// repeat while the flags left by the macro report a negative result
	// (NOTE(review): presumably the index increment -- confirm)
	jl	.L4_17
	ALIGN_4


.L4_19:

	// NOTE(review): SAVE16x4 is defined elsewhere; presumably it writes the
	// accumulated tile to C through CO1/CO2 -- confirm.
	SAVE16x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $16, KK				
#endif

	addq	$16 * SIZE, CO1		# coffset += 16
	addq	$16 * SIZE, CO2		# coffset += 16
	decq	I			# i --
	jg	.L4_11
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
// Dispatch on the low four bits of M: skip the whole remainder section when
// M is a multiple of 16, and skip the 8-row tile when bit 3 of M is clear.
.L4_20:
	// Test rest of M

	testq	$15, M
	jz	.L4_60		// to next 4 lines of N

	testq	$8, M		
	jz	.L4_21pre
	ALIGN_4

/**************************************************************************/

// 8x4 tile (executed once, when bit 3 of M is set).  Same structure as the
// 16x4 tile with the A index scaled by 8 instead of 16.  KERNEL8x4_SUB and
// SAVE8x4 are macros defined outside this view.
.L4_20_1:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax        // number of values in A
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L4_20_6
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_20_2:

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L4_20_6

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	je	.L4_20_6

	jmp	.L4_20_2
	ALIGN_4

.L4_20_6:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L4_20_9

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_20_7:

	KERNEL8x4_SUB

	jl	.L4_20_7
	ALIGN_4


.L4_20_9:

	SAVE8x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	addq	$8 * SIZE, CO2		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

// 4x4 tile (executed once, when bit 2 of M is set).  Same structure as the
// larger tiles with the A index scaled by 4.  KERNEL4x4_SUB and SAVE4x4 are
// macros defined outside this view.
.L4_21pre:

	testq	$4, M		
	jz	.L4_30
	ALIGN_4

.L4_21:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax        // number of values in A
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L4_26
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_22:

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L4_26

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	je	.L4_26

	jmp	.L4_22
	ALIGN_4

.L4_26:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L4_29

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_27:

	KERNEL4x4_SUB

	jl	.L4_27
	ALIGN_4


.L4_29:

	SAVE4x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4
	

// 2x4 tile (executed once, when bit 1 of M is set).  Same structure as the
// larger tiles with the A index scaled by 2.  KERNEL2x4_SUB and SAVE2x4 are
// macros defined outside this view.
.L4_30:
	testq	$2, M		
	jz	.L4_40

	ALIGN_4

.L4_31:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L4_36
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_32:

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L4_36

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	je	.L4_36

	jmp	.L4_32
	ALIGN_4

.L4_36:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L4_39

	movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
	
	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_37:

	KERNEL2x4_SUB

	jl	.L4_37
	ALIGN_4


.L4_39:

	SAVE2x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4

// 1x4 tile (executed once, when bit 0 of M is set).  The A index needs no
// scaling here because one value of A is consumed per k step.
// KERNEL1x4_SUB and SAVE1x4 are macros defined outside this view.
.L4_40:
	testq	$1, M		
	jz	.L4_60		// to next 4 lines of N

	ALIGN_4

.L4_41:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $4, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L4_46
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_42:

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L4_46

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	je	.L4_46

	jmp	.L4_42
	ALIGN_4

.L4_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L4_49

	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                   	//  BI = BI * 4 ; number of values

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L4_47:

	KERNEL1x4_SUB

	jl	.L4_47
	ALIGN_4


.L4_49:

	SAVE1x4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
	leaq	(,BI, 4), BI			// BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO         
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

	addq	$1 * SIZE, CO1		# coffset += 1
	addq	$1 * SIZE, CO2		# coffset += 1
	ALIGN_4
	



	
// End of one 4-column panel: when LEFT is not defined KK moves on by the
// panel width, then the panel loop repeats while J is still positive.
.L4_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq    $4, KK
#endif

	decq	J			// j --
	jg	.L4_01			// next 4 lines of N



/*******************************************************************************************/
// Remainder of N after the 4-column panels.  Nmod6 was stored as N % 4 during
// setup (the name does not describe its contents).  Finish when it is zero,
// and go straight to the single-column code when bit 1 is clear.
.L2_0:

	movq	Nmod6, J		
	andq	$3, J			// j % 4
	je	.L999

	movq	Nmod6, J		
	andq	$2, J			// bit 1 of the remainder: is there a 2-column panel?
	je	.L1_0

// Copy the 2-column panel of B into BUFFER1.  Each vmovsd moves 64 bits
// (two values, assuming SIZE is 4 bytes as in a single-precision build).
// The first loop runs K / 4 times and moves 8 values per pass, the second
// loop runs K % 4 times and moves 2 values per pass.
.L2_01:

	// copy to sub buffer
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	sarq	$2, %rax		// K / 4
	jz	.L2_01b
	ALIGN_4

.L2_01a:

	vmovsd	      (BO1), %xmm0
	vmovsd	2*SIZE(BO1), %xmm1
	vmovsd	4*SIZE(BO1), %xmm2
	vmovsd	6*SIZE(BO1), %xmm3

	vmovsd	%xmm0,       (BO)
	vmovsd	%xmm1, 2*SIZE(BO)
	vmovsd	%xmm2, 4*SIZE(BO)
	vmovsd	%xmm3, 6*SIZE(BO)

	addq	$8*SIZE,BO1
	addq	$8*SIZE,BO
	decq	%rax
	jnz	.L2_01a


.L2_01b:

        movq    K, %rax
        andq    $3, %rax                // K % 4
        jz      .L2_02d
        ALIGN_4

.L2_02c:

	vmovsd 	(BO1), %xmm0
	vmovsd 	%xmm0, (BO)
	addq	$2*SIZE,BO1
	addq	$2*SIZE,BO
	decq	%rax
	jnz	.L2_02c

.L2_02d:

	movq	BO1, B			// next offset of B

// Per-panel setup for the 2-column case: CO1 = C, then C moves on by 2 * LDC.
// AO restarts at A plus the same 16 * SIZE bias used by the 4-column code.
.L2_10:
	movq	C, CO1
	leaq	(C, LDC, 2), C		// c += 2 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        // KK restarts from OFFSET for every panel when LEFT is defined
        movq    OFFSET, %rax
        movq    %rax, KK
#endif
	
	movq	A, AO		 	// aoffset = a
	addq	$16 * SIZE, AO

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L2_20			// no full 16-row tile

	ALIGN_4

// 16x2 tile loop: one pass per group of 16 rows of M (I counts the groups
// down) against the buffered 2-column panel.  The B index is scaled by 2 and
// the A index by 16.  KERNEL16x2_SUB and SAVE16x2 are macros defined outside
// this view.
.L2_11:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $16, %rax	// number of values in AO
#else
        addq    $2, %rax	// number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L2_16
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_12:

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L2_16

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB

	je	.L2_16

	jmp	.L2_12
	ALIGN_4

.L2_16:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L2_19

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_17:

	KERNEL16x2_SUB

	jl	.L2_17
	ALIGN_4


.L2_19:

	SAVE16x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $16, KK				
#endif

	addq	$16 * SIZE, CO1		# coffset += 16
	decq	I			# i --
	jg	.L2_11
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
// Dispatch on the low four bits of M for the 2-column panel: skip the whole
// remainder section when M is a multiple of 16, and skip the 8-row tile when
// bit 3 of M is clear.
.L2_20:
	// Test rest of M

	testq	$15, M
	jz	.L2_60		// to next 2 lines of N

	testq	$8, M		
	jz	.L2_21pre
	ALIGN_4

/**************************************************************************/

// 8x2 tile (executed once, when bit 3 of M is set).  B index scaled by 2,
// A index scaled by 8.  KERNEL8x2_SUB and SAVE8x2 are macros defined outside
// this view.
.L2_20_1:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax        // number of values in A
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L2_20_6
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_20_2:


	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L2_20_6

	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB

	je	.L2_20_6

	jmp	.L2_20_2
	ALIGN_4

.L2_20_6:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L2_20_9

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_20_7:

	KERNEL8x2_SUB

	jl	.L2_20_7
	ALIGN_4


.L2_20_9:

	SAVE8x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

// 4x2 tile (executed once, when bit 2 of M is set).  B index scaled by 2,
// A index scaled by 4.  KERNEL4x2_SUB and SAVE4x2 are macros defined outside
// this view.
.L2_21pre:

	testq	$4, M		
	jz	.L2_30
	ALIGN_4

.L2_21:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax        // number of values in A
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L2_26
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_22:


	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L2_26

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	je	.L2_26

	jmp	.L2_22
	ALIGN_4

.L2_26:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L2_29

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_27:

	KERNEL4x2_SUB

	jl	.L2_27
	ALIGN_4


.L2_29:

	SAVE4x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	ALIGN_4
	

// 2x2 tile (executed once, when bit 1 of M is set).  Both the B index and
// the A index are scaled by 2.  KERNEL2x2_SUB and SAVE2x2 are macros defined
// outside this view.
.L2_30:
	testq	$2, M		
	jz	.L2_40

	ALIGN_4

.L2_31:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L2_36
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_32:

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L2_36

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	je	.L2_36

	jmp	.L2_32
	ALIGN_4

.L2_36:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L2_39

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
	
	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_37:

	KERNEL2x2_SUB

	jl	.L2_37
	ALIGN_4


.L2_39:

	SAVE2x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	ALIGN_4

// 1x2 tile (executed once, when bit 0 of M is set).  The B index is scaled
// by 2; the A index needs no scaling.  KERNEL1x2_SUB and SAVE1x2 are macros
// defined outside this view.
.L2_40:
	testq	$1, M		
	jz	.L2_60		// to next 2 lines of N

	ALIGN_4

.L2_41:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        // Remaining TRMM variants: skip the first KK steps of both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        leaq    (AO, %rax, SIZE), AO
#endif


	vzeroall

	// rax = number of k steps for this tile; TRMM builds also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			// steps rounded down to a multiple of 8
	je	.L2_46
	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_42:

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	// NOTE(review): branches on flags left by the preceding macro -- confirm
	je	.L2_46

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	je	.L2_46

	jmp	.L2_42
	ALIGN_4

.L2_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7 : steps left after the unrolled part
	je .L2_49

	movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_47:

	KERNEL1x2_SUB

	jl	.L2_47
	ALIGN_4


.L2_49:

	SAVE1x2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        // skip the K - KKK steps that were not processed for this tile
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO         
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

	addq	$1 * SIZE, CO1		# coffset += 1
	ALIGN_4
	



	
// .L2_60: end of the two-column panel.  In the right-side TRMM build KK is
// advanced by the 2 columns just processed; execution then falls through
// to the single-column section below.
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq    $2, KK
#endif




.L1_0:

/************************************************************************************************
* Last single column: processed only when bit 0 of Nmod6 is set (Nmod6 % 2 > 0)
*************************************************************************************************/

	movq	Nmod6, J		
	andq	$1, J			// j % 2
	je	.L999			// no odd column left: go straight to the exit path
	ALIGN_4

.L1_01:
	// copy to sub buffer
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax			// rax = number of scalars to copy
	ALIGN_4

	// Copy K single-precision values from B into BUFFER1, one scalar per pass.
	// NOTE(review): the count is tested only after the first copy, so K == 0
	// is presumably excluded by the caller -- confirm.
.L1_02b:

	vmovss	(BO1), %xmm0
	vmovss	%xmm0,       (BO)
	addq	$1*SIZE,BO1
	addq	$1*SIZE,BO
	decq	%rax
	jnz	.L1_02b

.L1_02c:

	movq	BO1, B			// next offset of B

/*
 * .L1_10 / .L1_11: single-column panel, 16-row tiles (16x1).
 * Sets up CO1, KK and AO for the column, then runs one pass of .L1_11 per
 * group of 16 rows (M >> 4 passes).  KERNEL16x1_SUB and SAVE16x1 are macros
 * defined outside this section.
 */
.L1_10:
	movq	C, CO1
	leaq	(C, LDC, 1), C		// c += 1 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK		// restart KK from OFFSET for this column
#endif
	
	movq	A, AO		 	// aoffset = a
	addq	$16 * SIZE, AO		// bias AO by 16 elements

	movq	M,  I
	sarq	$4, I			// i = (m >> 4)
	je	.L1_20			// fewer than 16 rows: go to the rest-of-M tiles

	ALIGN_4

.L1_11:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
#else
        // Start KK steps into both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO              //  BO += 1 * KK elements
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO            //  AO += 16 * KK elements
#endif


	vzeroall			// zero all YMM registers (presumably the accumulators used by KERNEL16x1_SUB)

	// Select the step count in rax; the TRMM variants also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $16, %rax	// number of values in AO
#else
        addq    $1, %rax	// number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L1_16				// fewer than 8 steps: only the tail loop runs
	movq    %rax, BI                        //  Index for BO

	// Point AO/BO past the span handled here and negate both indices.
	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

	// Main loop: two groups of 8 steps per pass.  Each je uses the flags left by
	// the preceding KERNEL16x1_SUB expansion (macro defined elsewhere; presumably
	// its index update reaching zero -- confirm against the macro).
.L1_12:

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	je	.L1_16

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB

	je	.L1_16

	jmp	.L1_12
	ALIGN_4

	// Tail: reload the full step count and keep only its low three bits.
.L1_16:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7, the steps left over by the unrolled-by-8 loop
	je .L1_19

	movq    %rax, BI                        //  Index for BO

	salq	$4, %rax			// rax = rax * 16 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_17:

	KERNEL16x1_SUB

	jl	.L1_17			// repeat while the flags left by the macro report a negative result
	ALIGN_4


.L1_19:

	SAVE16x1			// macro defined elsewhere; presumably writes the tile through CO1

	// For these TRMM variants, advance AO/BO over the K - KKK steps not consumed above.
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO
        salq    $4, %rax                        // rax = rax * 16 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $16, KK				// sixteen more rows finished
#endif

	addq	$16 * SIZE, CO1		# coffset += 16
	decq	I			# i --
	jg	.L1_11			// more 16-row tiles remain
	ALIGN_4	

/**************************************************************************
* Rest of M 
***************************************************************************/
/*
 * .L1_20: dispatch for the rows left after the 16-row tiles (M & 15), followed
 * by the 8-row tile (8x1) of the single-column panel.  KERNEL8x1_SUB and
 * SAVE8x1 are macros defined outside this section.
 */
.L1_20:
	// Test rest of M

	testq	$15, M
	jz	.L999			// M is a multiple of 16: nothing left, exit

	testq	$8, M		
	jz	.L1_21pre		// bit 3 of M clear: no 8-row tile
	ALIGN_4

/**************************************************************************/

.L1_20_1:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
#else
        // Start KK steps into both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO              //  BO += 1 * KK elements
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO            //  AO += 8 * KK elements
#endif


	vzeroall			// zero all YMM registers (presumably the accumulators used by KERNEL8x1_SUB)

	// Select the step count in rax; the TRMM variants also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax        // number of values in A
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax		// round the count down to a multiple of 8
	je	.L1_20_6		// fewer than 8 steps: only the tail loop runs
	movq    %rax, BI                        //  Index for BO

	// Point AO/BO past the span handled here and negate both indices.
	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

	// Main loop: two groups of 8 steps per pass.  Each je uses the flags left by
	// the preceding KERNEL8x1_SUB expansion (macro defined elsewhere; presumably
	// its index update reaching zero -- confirm against the macro).
.L1_20_2:

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	je	.L1_20_6

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB

	je	.L1_20_6

	jmp	.L1_20_2
	ALIGN_4

	// Tail: reload the full step count and keep only its low three bits.
.L1_20_6:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7, the steps left over by the unrolled-by-8 loop
	je .L1_20_9

	movq    %rax, BI                        //  Index for BO

	salq	$3, %rax			// rax = rax * 8 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_20_7:

	KERNEL8x1_SUB

	jl	.L1_20_7		// repeat while the flags left by the macro report a negative result
	ALIGN_4


.L1_20_9:

	SAVE8x1				// macro defined elsewhere; presumably writes the tile through CO1

	// For these TRMM variants, advance AO/BO over the K - KKK steps not consumed above.
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        salq    $3, %rax                        // rax = rax * 8 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK			// eight more rows finished
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	ALIGN_4
	


/**************************************************************************/

/*
 * .L1_21pre / .L1_21: 4-row tile (4x1) of the single-column panel.
 * Runs only when bit 2 of M is set.  KERNEL4x1_SUB and SAVE4x1 are macros
 * defined outside this section.
 */
.L1_21pre:

	testq	$4, M		
	jz	.L1_30			// bit 2 of M clear: no 4-row tile
	ALIGN_4

.L1_21:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
#else
        // Start KK steps into both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO              //  BO += 1 * KK elements
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO            //  AO += 4 * KK elements
#endif


	vzeroall			// zero all YMM registers (presumably the accumulators used by KERNEL4x1_SUB)

	// Select the step count in rax; the TRMM variants also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax        // number of values in A
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax		// round the count down to a multiple of 8
	je	.L1_26			// fewer than 8 steps: only the tail loop runs
	movq    %rax, BI                        //  Index for BO

	// Point AO/BO past the span handled here and negate both indices.
	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

	// Main loop: two groups of 8 steps per pass.  Each je uses the flags left by
	// the preceding KERNEL4x1_SUB expansion (macro defined elsewhere; presumably
	// its index update reaching zero -- confirm against the macro).
.L1_22:

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	je	.L1_26

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	je	.L1_26

	jmp	.L1_22
	ALIGN_4

	// Tail: reload the full step count and keep only its low three bits.
.L1_26:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7, the steps left over by the unrolled-by-8 loop
	je .L1_29

	movq    %rax, BI                        //  Index for BO

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_27:

	KERNEL4x1_SUB

	jl	.L1_27			// repeat while the flags left by the macro report a negative result
	ALIGN_4


.L1_29:

	SAVE4x1				// macro defined elsewhere; presumably writes the tile through CO1

	// For these TRMM variants, advance AO/BO over the K - KKK steps not consumed above.
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        salq    $2, %rax                        // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK			// four more rows finished
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	ALIGN_4
	

/*
 * .L1_30 / .L1_31: 2-row tile (2x1) of the single-column panel.
 * Runs only when bit 1 of M is set.  KERNEL2x1_SUB and SAVE2x1 are macros
 * defined outside this section.
 */
.L1_30:
	testq	$2, M		
	jz	.L1_40			// bit 1 of M clear: no 2-row tile

	ALIGN_4

.L1_31:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
#else
        // Start KK steps into both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO              //  BO += 1 * KK elements
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO            //  AO += 2 * KK elements
#endif


	vzeroall			// zero all YMM registers (presumably the accumulators used by KERNEL2x1_SUB)

	// Select the step count in rax; the TRMM variants also store it in KKK.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax		// round the count down to a multiple of 8
	je	.L1_36			// fewer than 8 steps: only the tail loop runs
	movq    %rax, BI                        //  Index for BO

	// Point AO/BO past the span handled here and negate both indices.
	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

	// Main loop: two groups of 8 steps per pass.  Each je uses the flags left by
	// the preceding KERNEL2x1_SUB expansion (macro defined elsewhere; presumably
	// its index update reaching zero -- confirm against the macro).
.L1_32:

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	je	.L1_36

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	je	.L1_36

	jmp	.L1_32
	ALIGN_4

	// Tail: reload the full step count and keep only its low three bits.
.L1_36:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7, the steps left over by the unrolled-by-8 loop
	je .L1_39

	movq    %rax, BI                        //  Index for BO
	
	salq	$1, %rax			// rax = rax *2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_37:

	KERNEL2x1_SUB

	jl	.L1_37			// repeat while the flags left by the macro report a negative result
	ALIGN_4


.L1_39:

	SAVE2x1				// macro defined elsewhere; presumably writes the tile through CO1

	// For these TRMM variants, advance AO/BO over the K - KKK steps not consumed above.
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        salq    $1, %rax                        // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK			// two more rows finished
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	ALIGN_4

/*
 * .L1_40 / .L1_41: final single-element tile (1x1) of the single-column panel.
 * Runs only when bit 0 of M is set; otherwise control goes to the exit path.
 * KERNEL1x1_SUB and SAVE1x1 are macros defined outside this section.
 */
.L1_40:
	testq	$1, M		
	jz	.L999			// M is even: nothing left to compute

	ALIGN_4

.L1_41:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
#else
        // Start KK steps into both operands.
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO           // bias BO by 4 elements
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO              //  BO += 1 * KK elements
        leaq    (AO, %rax, SIZE), AO            //  AO += 1 * KK elements
#endif


	vzeroall			// zero all YMM registers (presumably the accumulators used by KERNEL1x1_SUB)

	// Select the step count in rax; the TRMM variants also store it in KKK.
	// Both arms of the inner LEFT test add 1, because the tile is 1 wide and 1 tall.
#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif

	andq	$-8, %rax		// round the count down to a multiple of 8
	je	.L1_46			// fewer than 8 steps: only the tail loop runs
	movq    %rax, BI                        //  Index for BO

	// Point AO/BO past the span handled here and negate both indices.
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

	// Main loop: two groups of 8 steps per pass.  Each je uses the flags left by
	// the preceding KERNEL1x1_SUB expansion (macro defined elsewhere; presumably
	// its index update reaching zero -- confirm against the macro).
.L1_42:

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	je	.L1_46

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	je	.L1_46

	jmp	.L1_42
	ALIGN_4

	// Tail: reload the full step count and keep only its low three bits.
.L1_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# rax = k & 7, the steps left over by the unrolled-by-8 loop
	je .L1_49

	movq    %rax, BI                        //  Index for BO

	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_47:

	KERNEL1x1_SUB

	jl	.L1_47			// repeat while the flags left by the macro report a negative result
	ALIGN_4


.L1_49:

	SAVE1x1				// macro defined elsewhere; presumably writes the tile through CO1

	// For these TRMM variants, advance AO/BO over the K - KKK steps not consumed above.
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax 
        subq    KKK, %rax
        movq    %rax, BI                        //  Index for BO
        leaq    (BO, BI, SIZE), BO         
        leaq    (AO, %rax, SIZE), AO
#endif  


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK			// one more row finished
#endif

	addq	$1 * SIZE, CO1		# coffset += 1
	ALIGN_4
	

/*
 * .L999: common exit path of the kernel.
 * Reloads %rsp from SP, restores the registers saved in the frame
 * (rbx, rbp, r12-r15; plus rdi, rsi and xmm6-xmm15 when WINDOWS_ABI is
 * defined), releases STACKSIZE bytes and returns.  SP, STACKSIZE and the
 * EPILOGUE macro are defined outside this section.
 */
.L999:
	// The code above uses VEX-encoded instructions and may leave the upper
	// halves of the YMM registers dirty.  Clear them before the legacy-SSE
	// movups restores below and before returning to the caller, to avoid
	// AVX/SSE transition penalties.  Only the low 128 bits of xmm6-xmm15 are
	// callee-saved, so zeroing the upper halves is ABI-safe; vzeroupper does
	// not modify general-purpose registers or flags.
	vzeroupper

	movq   		SP, %rsp		// switch back to the frame address held in SP
	movq	   (%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	// Registers that are additionally callee-saved under the Windows x64 ABI.
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp		// release the register save area
	ret

	EPILOGUE


#endif

